Initial source drop of ocltst

This only adds source files for ocltst and the following test modules: oclruntime, oclperf, oclgl, and ocldx. There are no build files for now. Change-Id: I0f8d9d074c45d82e92f7d30bf22753102f272f4f [ROCm/clr commit: 75e6add24d]
2020-05-29 12:10:04 -04:00
commit 18ce996fe2
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCL_TEST_MODULE_H
+#define OCL_TEST_MODULE_H
+
+#include <string>
+
+#include "OCLTest.h"
+#include "OCLTestList.h"
+
+// Describes one test module: its name, its module handle, the entry points
+// used to talk to it, and a table of cached OCLTest pointers.
+// A default-constructed Module has an empty name and every other member
+// zero-initialized.
+struct Module {
+  std::string name;
+  ModuleHandle hmodule;
+
+  // Entry points associated with the module.
+  TestCountFuncPtr get_count;
+  TestNameFuncPtr get_name;
+  CreateTestFuncPtr create_test;
+  DestroyTestFuncPtr destroy_test;
+  TestVersionFuncPtr get_version;
+  TestLibNameFuncPtr get_libname;
+
+  OCLTest** cached_test;
+
+  Module()
+      : name(),
+        hmodule(0),
+        get_count(0), get_name(0),
+        create_test(0), destroy_test(0),
+        get_version(0), get_libname(0),
+        cached_test(0) {}
+};
+
+#endif  // OCL_TEST_MODULE_H
@@ -0,0 +1,71 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _RESULT_STRUCT_H_
+// Without this definition the include guard never takes effect and a second
+// inclusion redefines every type below.
+#define _RESULT_STRUCT_H_
+
+// TestResult stores a std::string, so the header must pull it in itself.
+#include <string>
+
+// A pair of test indices delimiting a range of tests.
+struct IndicesRange {
+  int startIndex;
+  int endIndex;
+};
+
+// Parenthesized so the negative literals stay intact inside larger expressions.
+#define INDEX_ALL_TESTS (-1)
+#define EXTREMELY_SMALL_VALUE (-10000.0f)
+#define EXTREMELY_LARGE_VALUE (10000.0f)
+
+// Outcome of a single test: a numeric value, a result text and a pass flag.
+class TestResult {
+ public:
+  float value;
+  std::string resultString;
+  bool passed;
+
+  // Starts out as "passed", holding val and a lone newline as result text.
+  TestResult(float val) : value(val), resultString("\n"), passed(true) {}
+
+  // Puts the object back into the state the constructor produces for val.
+  void reset(float val) {
+    resultString = "\n";
+    passed = true;
+    value = val;
+  }
+};
+
+// Aggregated outcome of a test run: the largest and smallest TestResult seen
+// so far, an overall success flag and the number of failed tests.
+//
+// A Report owns the two TestResult objects behind max and min. Copying and
+// assigning clone them, so two Reports never delete the same object (the
+// implicitly generated copy operations would have copied the raw pointers and
+// caused a double delete).
+class Report {
+ public:
+  TestResult *max;
+  TestResult *min;
+  bool success;
+  int numFailedTests;
+
+  // max starts at EXTREMELY_SMALL_VALUE and min at EXTREMELY_LARGE_VALUE so
+  // that any real measurement replaces them.
+  Report() : max(0), min(0), success(true), numFailedTests(0) {
+    max = new TestResult(EXTREMELY_SMALL_VALUE);
+    min = new TestResult(EXTREMELY_LARGE_VALUE);
+  }
+
+  // Deep copy: the new Report gets its own max/min objects.
+  Report(const Report &other)
+      : max(0),
+        min(0),
+        success(other.success),
+        numFailedTests(other.numFailedTests) {
+    max = clone(other.max);
+    min = clone(other.min);
+  }
+
+  // Deep assignment; the clones are made before the old objects are released
+  // so *this is left untouched if an allocation fails.
+  Report &operator=(const Report &other) {
+    if (this != &other) {
+      TestResult *newMax = clone(other.max);
+      TestResult *newMin = clone(other.min);
+      delete max;
+      delete min;
+      max = newMax;
+      min = newMin;
+      success = other.success;
+      numFailedTests = other.numFailedTests;
+    }
+    return *this;
+  }
+
+  // Restores the freshly constructed state without reallocating max/min.
+  void reset() {
+    max->reset(EXTREMELY_SMALL_VALUE);
+    min->reset(EXTREMELY_LARGE_VALUE);
+    success = true;
+    numFailedTests = 0;
+  }
+
+  ~Report() {
+    delete max;
+    delete min;
+  }
+
+ private:
+  // Copies *src onto the heap; a null source yields a null result because the
+  // public pointers may have been cleared by the caller.
+  static TestResult *clone(const TestResult *src) {
+    return src ? new TestResult(*src) : 0;
+  }
+};
+
+#endif  // _RESULT_STRUCT_H_
@@ -0,0 +1,111 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "Timer.h"
+
+#ifdef ATI_OS_WIN
+#include <windows.h>
+#endif
+
+#ifdef ATI_OS_LINUX
+#include <sys/time.h>
+#endif
+
+// Zeroes the accumulated tick count and the start stamp, then records how many
+// ticks make up one second on the current platform.
+CPerfCounter::CPerfCounter() : _clocks(0), _start(0) {
+#if defined(ATI_OS_WIN)
+  // Ticks are performance-counter units, so the rate comes from the OS.
+  QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&_freq));
+#endif
+
+#if defined(ATI_OS_LINUX)
+  // Time stamps are taken in milliseconds, i.e. 1000 ticks per second.
+  _freq = 1000;
+#endif
+}
+
+// Nothing to release: the counter only holds plain integer fields.
+CPerfCounter::~CPerfCounter() {}
+
+// Records the current time stamp as the beginning of a measured interval.
+void CPerfCounter::Start(void) {
+#if defined(ATI_OS_WIN)
+  // A non-zero stamp means an interval is already open: report it and quit.
+  if (_start != 0) {
+    MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK);
+    exit(0);
+  }
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&_start));
+#endif
+
+#if defined(ATI_OS_LINUX)
+  // Wall-clock time expressed in whole milliseconds.
+  struct timeval now;
+  gettimeofday(&now, 0);
+  const i64 millisFromSeconds = static_cast<i64>(now.tv_sec) * 1000;
+  const i64 millisFromMicros = static_cast<i64>(now.tv_usec) / 1000;
+  _start = millisFromSeconds + millisFromMicros;
+#endif
+}
+
+// Closes the interval opened by Start() and adds its length to the running
+// total; the start stamp is cleared afterwards.
+void CPerfCounter::Stop(void) {
+  i64 now;
+
+#if defined(ATI_OS_WIN)
+  // A zero stamp means Start() was never called: report it and quit.
+  if (_start == 0) {
+    MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK);
+    exit(0);
+  }
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&now));
+#endif
+
+#if defined(ATI_OS_LINUX)
+  // Same millisecond conversion as in Start().
+  struct timeval tv;
+  gettimeofday(&tv, 0);
+  now = static_cast<i64>(tv.tv_sec) * 1000 + static_cast<i64>(tv.tv_usec) / 1000;
+#endif
+
+  _clocks += now - _start;
+  _start = 0;
+}
+
+// Discards the accumulated time. On Windows, resetting while an interval is
+// still open is reported as an error and terminates the process.
+void CPerfCounter::Reset(void) {
+#if defined(ATI_OS_WIN)
+  if (_start != 0) {
+    MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK);
+    exit(0);
+  }
+#endif
+
+  _clocks = 0;
+}
+
+// Returns the accumulated time in seconds (ticks divided by ticks-per-second).
+// On Windows, asking while an interval is still open is reported as an error
+// and terminates the process.
+double CPerfCounter::GetElapsedTime(void) {
+#if defined(ATI_OS_WIN)
+  if (_start != 0) {
+    MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK);
+    exit(0);
+  }
+#endif
+
+  const double ticks = static_cast<double>(_clocks);
+  const double ticksPerSecond = static_cast<double>(_freq);
+  return ticks / ticksPerSecond;
+}
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+// Signed integer type (at least 64 bits wide) used for all tick arithmetic.
+#if defined(ATI_OS_WIN)
+typedef __int64 i64;
+#endif
+#if defined(ATI_OS_LINUX)
+typedef long long i64;
+#endif
+
+// Accumulating stopwatch: each Start()/Stop() pair adds the elapsed ticks to a
+// running total that GetElapsedTime() converts to seconds.
+class CPerfCounter {
+ public:
+  CPerfCounter();
+  ~CPerfCounter();
+
+  void Start();
+  void Stop();
+  void Reset();
+  double GetElapsedTime();
+
+ private:
+  i64 _freq;    // ticks per second
+  i64 _clocks;  // ticks accumulated so far
+  i64 _start;   // stamp taken by Start(); 0 while no interval is open
+};
+
+#endif  // _TIMER_H_
@@ -0,0 +1,180 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCL_TEST_WORKER_H
+#define OCL_TEST_WORKER_H
+
+/////////////////////////////////////////////////////////////////////////////
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "Module.h"
+#include "OCLTest.h"
+#include "OCLTestList.h"
+#include "ResultStruct.h"
+#include "Timer.h"
+#include "getopt.h"
+#include "pfm.h"
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Signature of the routine a Worker carries: takes one untyped pointer and
+// returns an untyped pointer.
+typedef void* (*TestMethod)(void*);
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Bundles everything one test run needs: the wrapper and module to run
+// against, the entry point, the test selection, dump/display options, an
+// optional float buffer (4 floats per pixel, width x height pixels) and the
+// TestResult the run reports into.
+//
+// Ownership: a Worker owns m_buffer and m_result. Both are deep-copied by the
+// copy constructor and copy assignment and released by the destructor, so
+// copies never share (and never double-delete) them. All other pointers are
+// borrowed and merely copied.
+class Worker {
+ public:
+  // Creates an idle worker: every field zero/false, no buffer and no result.
+  Worker()
+      : m_wrapper(0),
+        m_module(0),
+        m_run(0),
+        m_id(0),
+        m_subtest(0),
+        m_testindex(0),
+        m_dump(false),
+        m_display(false),
+        m_useCPU(false),
+        m_window(0),
+        m_width(0),
+        m_height(0),
+        m_buffer(0),
+        m_perflab(false),
+        m_deviceId(0),
+        m_platform(0),
+        m_result(0) {
+    // m_result must be null here: the destructor deletes it unconditionally.
+  }
+
+  // Creates a fully configured worker. When dumping or displaying is
+  // requested a zero-filled buffer of x * y pixels is allocated; a fresh
+  // TestResult(0.0f) is always allocated.
+  Worker(OCLWrapper* wrapper, Module* module, TestMethod run, unsigned int id,
+         unsigned int subtest, unsigned int testindex, bool dump, bool view,
+         bool useCPU, void* window, unsigned int x, unsigned int y,
+         bool perflab, unsigned int deviceId = 0, unsigned int platform = 0)
+      : m_wrapper(wrapper),
+        m_module(module),
+        m_run(run),
+        m_id(id),
+        m_subtest(subtest),
+        m_testindex(testindex),
+        m_dump(dump),
+        m_display(view),
+        m_useCPU(useCPU),
+        m_window(window),
+        m_width(x),
+        m_height(y),
+        m_buffer(0),
+        m_perflab(perflab),
+        m_deviceId(deviceId),
+        m_platform(platform),
+        m_result(0) {
+    if (m_dump || m_display) {
+      const size_t count = bufferCount(m_width, m_height);
+      m_buffer = new float[count];
+      if (m_buffer != 0) {
+        memset(m_buffer, 0, count * sizeof(float));
+      } else {
+        // Only reachable with a non-throwing allocator: without a buffer
+        // there is nothing to dump or display.
+        m_dump = false;
+        m_display = false;
+      }
+    }
+    m_result = new TestResult(0.0f);
+  }
+
+  // Deep copy. The owned pointers start out null so copyFrom() never touches
+  // indeterminate memory (the members of an object under construction are
+  // uninitialized until set here).
+  Worker(const Worker& w) : m_buffer(0), m_result(0) { copyFrom(w); }
+
+  // Deep assignment; self-assignment is a no-op.
+  Worker& operator=(const Worker& w) {
+    if (this != &w) copyFrom(w);
+    return *this;
+  }
+
+  ~Worker() {
+    delete[] m_buffer;  // deleting a null pointer is a no-op
+    m_buffer = 0;
+    delete m_result;
+    m_result = 0;
+  }
+
+  OCLWrapper* getOCLWrapper() const { return m_wrapper; }
+  Module* getModule() const { return m_module; }
+  TestMethod getTestMethod() const { return m_run; }
+  unsigned int getId() const { return m_id; }
+  unsigned int getSubTest() const { return m_subtest; }
+  unsigned int getTestIndex() const { return m_testindex; }
+  bool isDumpEnabled() const { return m_dump; }
+  bool isDisplayEnabled() const { return m_display; }
+  bool isCPUEnabled() const { return m_useCPU; }
+  void* getWindow() const { return m_window; }
+  unsigned int getWidth() const { return m_width; }
+  unsigned int getHeight() const { return m_height; }
+  float* getBuffer() const { return m_buffer; }
+  bool getPerflab() const { return m_perflab; }
+  unsigned int getDeviceId() const { return m_deviceId; }
+  TestResult* getResult() const { return m_result; }
+  unsigned int getPlatformID() const { return m_platform; }
+
+ private:
+  // Number of floats in a width x height buffer with 4 floats per pixel,
+  // computed in size_t so large dimensions do not wrap in 32-bit arithmetic.
+  static size_t bufferCount(unsigned int width, unsigned int height) {
+    return static_cast<size_t>(4) * width * height;
+  }
+
+  // Makes *this a deep copy of w. Requires m_buffer and m_result to be either
+  // null or owned by *this. The clones are created before the old objects
+  // are released so *this keeps its state if an allocation fails.
+  void copyFrom(const Worker& w) {
+    TestResult* result = w.m_result ? new TestResult(*w.m_result) : 0;
+    float* buffer = 0;
+    if (w.m_buffer) {
+      const size_t count = bufferCount(w.m_width, w.m_height);
+      buffer = new float[count];
+      memcpy(buffer, w.m_buffer, count * sizeof(float));
+    }
+
+    delete[] m_buffer;
+    delete m_result;
+    m_buffer = buffer;
+    m_result = result;
+
+    m_wrapper = w.m_wrapper;
+    m_module = w.m_module;
+    m_run = w.m_run;
+    m_id = w.m_id;
+    m_subtest = w.m_subtest;
+    m_testindex = w.m_testindex;
+    m_dump = w.m_dump;
+    m_display = w.m_display;
+    m_useCPU = w.m_useCPU;
+    m_window = w.m_window;
+    m_width = w.m_width;
+    m_height = w.m_height;
+    m_perflab = w.m_perflab;
+    m_deviceId = w.m_deviceId;
+    m_platform = w.m_platform;
+  }
+
+  OCLWrapper* m_wrapper;
+  Module* m_module;
+  TestMethod m_run;
+  unsigned int m_id;
+  unsigned int m_subtest;
+  unsigned int m_testindex;
+  bool m_dump;
+  bool m_display;
+  bool m_useCPU;
+  void* m_window;
+  unsigned int m_width;
+  unsigned int m_height;
+  float* m_buffer;  // owned; null unless dump or display is enabled
+  bool m_perflab;
+  unsigned int m_deviceId;
+  unsigned int m_platform;
+  TestResult* m_result;  // owned; null only for a default-constructed Worker
+};
+
+/////////////////////////////////////////////////////////////////////////////
+
+#endif  // OCL_TEST_WORKER_H
@@ -0,0 +1,162 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "oclsysinfo.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#include <cstdio>
+
+#ifndef MAX_DEVICES
+#define MAX_DEVICES 16
+#endif  // MAX_DEVICES
+
+// Queries one string-valued device property and appends
+// "<label><value>\n" to out. If the query fails (for example an AMD-only
+// property on another vendor's device) the value is left empty instead of
+// printing stale or uninitialized bytes.
+static void appendDeviceString(std::string &out, const char *label,
+                               cl_device_id device, cl_device_info param) {
+  char value[1024];
+  value[0] = '\0';
+  if (CL_SUCCESS !=
+      clGetDeviceInfo(device, param, sizeof(value), value, NULL)) {
+    value[0] = '\0';
+  }
+  value[sizeof(value) - 1] = '\0';
+  out.append(label).append(value).append("\n");
+}
+
+// Appends a human-readable description of one OpenCL device to info_string.
+//
+//   info_string    receives the text (existing content is kept)
+//   use_cpu        true selects CPU devices, false selects GPU devices
+//   dev_id         index of the device among the devices of that type
+//   platformIndex  index of the platform in the list from clGetPlatformIDs
+//
+// Returns 1 on success. Returns 0 after printing a message to stderr when a
+// platform/device query fails or an index is out of range. Every temporary
+// array is released on all paths.
+int oclSysInfo(std::string &info_string, bool use_cpu, unsigned dev_id,
+               unsigned int platformIndex) {
+  const cl_device_type deviceType =
+      use_cpu ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU;
+
+  cl_uint numPlatforms = 0;
+  cl_int error = clGetPlatformIDs(0, NULL, &numPlatforms);
+  if (CL_SUCCESS != error) {
+    fprintf(stderr, "clGetPlatformIDs() failed");
+    return 0;
+  }
+
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  if (0 < numPlatforms) {
+    // Reject the index before it is used to subscript the platform array.
+    if (platformIndex >= numPlatforms) {
+      fprintf(stderr, "Platform index %u is out of range (%u platforms).\n",
+              platformIndex, numPlatforms);
+      return 0;
+    }
+
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error = clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (CL_SUCCESS == error) {
+      platform = platforms[platformIndex];
+    }
+    // Only the selected handle is needed from here on.
+    delete[] platforms;
+    if (CL_SUCCESS != error) {
+      fprintf(stderr, "clGetPlatformIDs() failed");
+      return 0;
+    }
+
+    /* First call only asks how many devices of the requested type exist. */
+    error = clGetDeviceIDs(platform, deviceType, 0, NULL, &num_devices);
+    if (error) {
+      fprintf(stderr, "clGetDeviceIDs failed: %d\n", error);
+      return 0;
+    }
+  }
+  if (dev_id >= num_devices) {
+    fprintf(stderr, "Device selected does not exist.\n");
+    return 0;
+  }
+  if (NULL == platform) {
+    fprintf(stderr,
+            "Couldn't find platform with GPU devices, cannot proceed.\n");
+    return 0;
+  }
+
+  cl_device_id *devices =
+      static_cast<cl_device_id *>(malloc(num_devices * sizeof(cl_device_id)));
+  if (!devices) {
+    fprintf(stderr, "no devices\n");
+    return 0;
+  }
+
+  /* Get the requested device */
+  error = clGetDeviceIDs(platform, deviceType, num_devices, devices, NULL);
+  if (error) {
+    free(devices);
+    fprintf(stderr, "clGetDeviceIDs failed: %d\n", error);
+    return 0;
+  }
+  const cl_device_id device = devices[dev_id];
+  free(devices);
+
+  info_string.append("\nCompute Device info:\n");
+
+  char platformVersion[1024];
+  platformVersion[0] = '\0';
+  if (CL_SUCCESS != clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                      sizeof(platformVersion), platformVersion,
+                                      NULL)) {
+    platformVersion[0] = '\0';
+  }
+  platformVersion[sizeof(platformVersion) - 1] = '\0';
+  info_string.append("\tPlatform Version: ")
+      .append(platformVersion)
+      .append("\n");
+
+  // Values are appended straight to the std::string: they can be up to 1024
+  // bytes long and must not be squeezed through a smaller fixed buffer.
+  appendDeviceString(info_string, "\tDevice Name: ", device, CL_DEVICE_NAME);
+  appendDeviceString(info_string, "\tVendor: ", device, CL_DEVICE_VENDOR);
+  appendDeviceString(info_string, "\tDevice Version: ", device,
+                     CL_DEVICE_VERSION);
+  appendDeviceString(info_string, "\tDriver Version: ", device,
+                     CL_DRIVER_VERSION);
+  appendDeviceString(info_string, "\tBoard Name: ", device,
+                     CL_DEVICE_BOARD_NAME_AMD);
+#if defined(ATI_OS_LINUX)
+  cl_device_topology_amd topology;
+  // Only look at the union when the query actually filled it in.
+  if (CL_SUCCESS == clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD,
+                                    sizeof(topology), &topology, NULL) &&
+      topology.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) {
+    // The PCIe fields are signed cl_char; go through unsigned char so bus
+    // numbers of 0x80 and above are not printed as negative values.
+    char line[128];
+    snprintf(line, sizeof(line),
+             "\tDevice Topology: PCI[ B#%d, D#%d, F#%d]\n",
+             static_cast<int>(static_cast<unsigned char>(topology.pcie.bus)),
+             static_cast<int>(static_cast<unsigned char>(topology.pcie.device)),
+             static_cast<int>(
+                 static_cast<unsigned char>(topology.pcie.function)));
+    info_string.append(line);
+  }
+#endif
+  return 1;
+}
@@ -0,0 +1,28 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLSYSINFO_H_
+#define _OCLSYSINFO_H_
+#include <string>
+
+// Appends a textual description of one OpenCL device to info_string.
+// useCPU selects CPU devices instead of GPU devices, dev_id picks the device
+// among those, and platformIndex picks the platform (the first one by
+// default). Returns 1 on success and 0 on failure.
+int oclSysInfo(std::string& info_string, bool useCPU, unsigned int dev_id,
+               unsigned int platformIndex = 0);
+
+#endif  //_OCLSYSINFO_H_
@@ -0,0 +1,79 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "pfm.h"
+
+#ifdef ATI_OS_WIN
+#include <io.h>
+#endif
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+// Writes a float image to `filename` as a colour PFM file.
+//
+//   buffer      width * height pixels of `components` floats each, row 0 first
+//   width       pixels per row (any size; the scanline buffer is heap-allocated)
+//   height      number of rows
+//   components  floats per source pixel; with 1 or 2 components the missing
+//               channels repeat component 0, components beyond 3 are ignored
+//
+// Rows are written last-to-first and the header carries a scale of -1.
+// Returns 0 on success and 1 on any failure (invalid argument, allocation,
+// open, write or close error).
+unsigned int SavePFM(const char* filename, const float* buffer,
+                     unsigned int width, unsigned int height,
+                     unsigned int components) {
+  if (filename == NULL || buffer == NULL || components == 0) {
+    return 1;
+  }
+
+  // One scanline of RGB output. Sized from the actual width so wide images
+  // cannot overrun it; at least one element is requested because malloc(0)
+  // may return NULL.
+  const size_t lineSize = static_cast<size_t>(width) * 3;
+  float* line =
+      static_cast<float*>(malloc((lineSize ? lineSize : 1) * sizeof(float)));
+  if (line == NULL) {
+    return 1;
+  }
+
+  //
+  // open the image file for writing
+  //
+  FILE* fh = fopen(filename, "wb");
+  if (fh == NULL) {
+    free(line);
+    return 1;
+  }
+
+  unsigned int error = 0;
+
+  //
+  // write the PFM header (dimensions are unsigned, hence %u)
+  //
+#define PFMEOL "\x0a"
+  if (fprintf(fh, "PF" PFMEOL "%u %u" PFMEOL "-1" PFMEOL, width, height) < 0) {
+    error = 1;
+  }
+
+  //
+  // write each scanline, last row first; offsets are computed in size_t so
+  // large images do not wrap in 32-bit arithmetic
+  //
+  for (unsigned int y = height; error == 0 && y > 0; y--) {
+    const float* row =
+        buffer + static_cast<size_t>(components) * width * (y - 1);
+    for (unsigned int x = 0; x < width; x++) {
+      const float* px = row + static_cast<size_t>(x) * components;
+      float* out = line + static_cast<size_t>(x) * 3;
+      out[0] = px[0];
+      out[1] = (components > 1) ? px[1] : px[0];
+      out[2] = (components > 2) ? px[2] : px[0];
+    }
+    if (fwrite(line, sizeof(float), lineSize, fh) != lineSize) {
+      error = 1;
+    }
+  }
+
+  // fclose flushes the stream; a failure here means data was lost.
+  if (fclose(fh) != 0) {
+    error = 1;
+  }
+  free(line);
+
+  return error;
+}
@@ -0,0 +1,28 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _PFM_H_
+#define _PFM_H_
+
+// Writes `buffer` (width x height pixels, `components` floats per pixel) to
+// `filename` as a PFM image. Returns 0 on success and 1 on failure.
+unsigned int SavePFM(const char* filename, const float* buffer,
+                     unsigned int width, unsigned int height,
+                     unsigned int components);
+
+#endif  // _PFM_H_
@@ -0,0 +1,148 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCL_THREAD_H
+#define OCL_THREAD_H
+
+//!
+//! \file Thread.h
+//!
+
+#ifdef ATI_OS_WIN
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0501
+#endif
+
+#include "windows.h"
+#else
+#include "pthread.h"
+#endif
+
+//! Signature of a thread entry point: it receives one untyped argument and
+//! returns an untyped pointer (the same prototype is used on every platform).
+typedef void *(*oclThreadFunc)(void *);
+
+namespace OCLutil {
+//! \class Lock
+//! \brief Provides a wrapper for locking primitives used to
+//!  synchronize _CPU_ threads.
+//!
+//! Common usage would be:
+//!
+//!    OCLutil::Lock lock;
+//!
+//!    ....
+//!
+//!    // Critical section begins
+//!
+//!    lock.lock();
+//!
+//!    .....
+//!
+//!    // Critical section ends
+//!
+//!    lock.unlock();
+//!
+
+class Lock {
+ public:
+  //! Constructor for Lock
+  Lock();
+
+  //! Destructor for Lock
+  ~Lock();
+
+  //! Acquire the lock: continue if it is available, otherwise wait for it
+  void lock();
+
+  //! Try to acquire the lock: hold it if it is available, otherwise carry on
+  //! with something else (NOTE(review): presumably returns true when acquired)
+  bool tryLock();
+
+  //! Unlock the lock and return
+  void unlock();
+
+ private:
+  /////////////////////////////////////////////////////////////
+  //!
+  //! Private data members and methods
+  //!
+
+  //! System specific synchronization primitive
+#ifdef ATI_OS_WIN
+  CRITICAL_SECTION _cs;
+#else
+  pthread_mutex_t _lock;
+#endif
+};
+
+//////////////////////////////////////////////////////////////
+//!
+//! \class Thread
+//! \brief Provides a wrapper for creating a _CPU_ thread.
+//!
+//! This class provides a simple wrapper to a CPU thread.
+//! The class name might be a bit confusing, especially considering
+//! that the GPU has its own threads as well.
+//!
+class Thread {
+ public:
+  //! Thread constructor and destructor. Note that the thread is
+  //! NOT created in the constructor. The thread creation takes
+  //! place in the create method
+  Thread();
+
+  ~Thread();
+
+  //! Wrapper for pthread_create. Pass the thread's entry
+  //! point and data to be passed to the routine
+  bool create(oclThreadFunc func, void *arg);
+
+  //! Wrapper for pthread_join. The calling thread
+  //! will wait until _this_ thread exits
+  bool join();
+
+  //! Get the thread data passed by the application
+  void *getData() { return _data; }
+
+  //! Get the thread ID
+  static unsigned int getID();
+
+ private:
+  /////////////////////////////////////////////////////////////
+  //!
+  //! Private data members and methods
+  //!
+
+#ifdef ATI_OS_WIN
+  //!  store the handle
+  HANDLE _tid;
+
+  unsigned int _ID;
+#else
+  pthread_t _tid;
+
+  pthread_attr_t _attr;
+#endif
+
+  void *_data;
+};
+};  // namespace OCLutil
+#endif
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCLLOG_H_
+#define OCLLOG_H_
+
+#ifdef ATI_OS_WIN
+
+#ifdef OCLTST_LOG_BUILD
+#define DLLIMPORT __declspec(dllexport)
+#else
+#define DLLIMPORT __declspec(dllimport)
+#endif  // OCLTST_LOG_BUILD
+
+#else
+#define DLLIMPORT
+
+#endif  // ATI_OS_WIN
+
+enum oclLoggingLevel {
+  OCLTEST_LOG_ALWAYS,   // value 0; presumably printed at every level -- TODO confirm
+  OCLTEST_LOG_VERBOSE,  // value 1; presumably printed only when verbose -- TODO confirm
+};
+
+extern DLLIMPORT void oclTestLog(oclLoggingLevel logLevel, const char* fmt,
+                                 ...);  // variadic; fmt is presumably printf-style -- TODO confirm
+extern DLLIMPORT void oclTestSetLogLevel(int level);  // NOTE(review): takes int, not oclLoggingLevel
+extern DLLIMPORT void oclTestEnableLogToFile(const char* filename);  // implementation not visible here
+
+#endif  // OCLLOG_H_
@@ -0,0 +1,73 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLTEST_H_
+#define _OCLTEST_H_
+
+#include <string>
+
+#include "OCLWrapper.h"
+
+class BaseTestImp;
+class OCLTestImp;
+// Abstract interface of a single ocltst test.
+//
+// Every operation is pure virtual except cache_test(), which has a default
+// implementation, and resetDescString(), which is a non-virtual helper.
+class OCLTest {
+ public:
+  virtual unsigned int getThreadUsage(void) = 0;
+  virtual int getNumSubTests(void) = 0;
+
+  // Overloads of open(); they differ only in the arguments supplied.
+  // NOTE(review): the meaning of `units` / `conversion` is not visible in this
+  // header - confirm against the implementations before relying on it.
+  virtual void open() = 0;
+  virtual void open(unsigned int test, const char* deviceName,
+                    unsigned int architecture) = 0;
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId, unsigned int platformIndex) = 0;
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId) = 0;
+
+  virtual void run(void) = 0;
+  virtual unsigned int close(void) = 0;
+
+  // Error state accessors.
+  virtual void setErrorMsg(const char* error) = 0;
+  virtual const char* getErrorMsg(void) = 0;
+  virtual bool hasErrorOccured(void) = 0;
+  virtual void clearError() = 0;
+
+  virtual void setDeviceId(unsigned int deviceId) = 0;
+  virtual void setPlatformIndex(unsigned int platformIndex) = 0;
+
+  // Conversions to the implementation base classes (only forward-declared
+  // in this header).
+  virtual OCLTestImp* toOCLTestImp() = 0;
+  virtual BaseTestImp* toBaseTestImp() = 0;
+
+  virtual float getPerfInfo() = 0;
+  virtual void clearPerfInfo(void) = 0;
+
+  virtual void setIterationCount(int cnt) = 0;
+  virtual void useCPU() = 0;
+
+  // Having this return true will allow the creation of the test to be cached
+  // in between runs; a cached test is only deleted after all the tests are
+  // finished running.
+  // The default is false (opt-in) as not many tests are modified to use it;
+  // a test that supports caching overrides this to return true.
+  // FIXME: Switch all tests to support caching.
+  virtual bool cache_test() { return false; }
+
+  // Public description string; resetDescString() empties it.
+  std::string testDescString;
+  void resetDescString(void) { testDescString.clear(); }
+
+  // Virtual so that deleting through an OCLTest* runs the derived destructor.
+  virtual ~OCLTest() {}
+};
+
+#endif  // _OCLTEST_H_
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLMODULE_H_
+#define _OCLMODULE_H_
+
+#ifdef ATI_OS_WIN  // OCLLCONV: calling convention used by the typedefs below
+#define OCLLCONV __cdecl
+#endif  // ATI_OS_WIN
+#ifdef ATI_OS_LINUX
+#define OCLLCONV
+#endif  // ATI_OS_LINUX; NOTE(review): undefined if neither macro is set
+
+class OCLTest;
+
+// Function pointer typedefs for the entry points exported by a test module.
+// struct Module keeps one pointer of each type (get_count, get_name,
+// create_test, destroy_test, get_version, get_libname).
+typedef unsigned int(OCLLCONV *TestCountFuncPtr)(void);
+typedef const char *(OCLLCONV *TestNameFuncPtr)(unsigned int);
+typedef OCLTest *(OCLLCONV *CreateTestFuncPtr)(unsigned int);
+typedef void(OCLLCONV *DestroyTestFuncPtr)(OCLTest *);
+typedef unsigned int(OCLLCONV *TestVersionFuncPtr)(void);
+typedef const char *(OCLLCONV *TestLibNameFuncPtr)(void);
+
+#endif  // _OCLMODULE_H_
@@ -0,0 +1,31 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCLTESTUTILS_H_
+#define OCLTESTUTILS_H_
+#include <string>
+
+// @brief Load the contents of a file into a string.
+// @param FN Name of the file to be loaded
+// @param S  String that receives the loaded file
+// @return true on success
+bool loadFile(const char* FN, std::string& S);
+
+#endif /* OCLTESTUTILS_H_ */
@@ -0,0 +1,614 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef __OCLWrapper_H
+#define __OCLWrapper_H
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_gl.h"
+#include "cl_profile_amd.h"
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clUnloadPlatformAMD_fn)(
+    cl_platform_id id);  // same parameters as OCLWrapper::clUnloadPlatformAMD
+
+// Function pointer types for the cl_khr_gl_sharing extension (missing in
+// cl_gl.h). GL names, targets and mip levels are plain (unsigned) int here.
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties *properties, cl_gl_context_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret);
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLBuffer_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int bufobj,
+    int *errcode_ret);  // NOTE(review): int* here, cl_int* in the siblings
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int texture_target,
+    int miplevel, unsigned int texture, cl_int *errcode_ret);
+// Same parameter list as clCreateFromGLTexture_fn.
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture2D_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int texture_target,
+    int miplevel, unsigned int texture, cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLRenderbuffer_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int renderbuffer,
+    cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLObjectInfo_fn)(
+    cl_mem memobj, cl_gl_object_type *gl_object_type,
+    unsigned int *gl_object_name);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLTextureInfo_fn)(
+    cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
+    void *param_value, size_t *param_value_size_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueAcquireGLObjects_fn)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+// Same parameter list as clEnqueueAcquireGLObjects_fn.
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueReleaseGLObjects_fn)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+
+// Function pointer types for the AMD performance-counter entry points.
+typedef CL_API_ENTRY cl_perfcounter_amd(CL_API_CALL *clCreatePerfCounterAMD_fn)(
+    cl_device_id device, cl_perfcounter_property *properties,
+    cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueBeginPerfCounterAMD_fn)(
+    cl_command_queue command_queue, cl_uint num_perf_counters,
+    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+// Same parameter list as clEnqueueBeginPerfCounterAMD_fn.
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueEndPerfCounterAMD_fn)(
+    cl_command_queue command_queue, cl_uint num_perf_counters,
+    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetPerfCounterInfoAMD_fn)(
+    cl_perfcounter_amd perf_counter, cl_perfcounter_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clReleasePerfCounterAMD_fn)(
+    cl_perfcounter_amd perf_counter);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clRetainPerfCounterAMD_fn)(
+    cl_perfcounter_amd perf_counter);
+// Presumably the pointer type for a clSetDeviceClockModeAMD entry point.
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clSetDeviceClockModeAMD_fn)(
+    cl_device_id device,
+    cl_set_device_clock_mode_input_amd set_clock_mode_input,
+    cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
+
+class OCLWrapper {
+ public:
+  OCLWrapper();
+
+  ~OCLWrapper() {}
+
+  // All OCL APIs are declared in the order they appear in cl.h
+
+  cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
+                          cl_uint *num_platforms);
+
+  cl_int clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
+                           size_t param_value_size, void *param_value,
+                           size_t *param_value_size_ret);
+
+  cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
+                        cl_uint num_entries, cl_device_id *devices,
+                        cl_uint *num_devices);
+
+  cl_int clGetDeviceInfo(cl_device_id device, cl_device_info param_name,
+                         size_t param_value_size, void *param_value,
+                         size_t *param_value_size_ret);
+
+  cl_context clCreateContext(cl_context_properties *properties,
+                             cl_uint num_devices, const cl_device_id *devices,
+                             void(CL_CALLBACK *pfn_notify)(const char *,
+                                                           const void *, size_t,
+                                                           void *),
+                             void *user_data, cl_int *errcode_ret);
+
+  cl_context clCreateContextFromType(
+      cl_context_properties *properties, cl_device_type device_type,
+      void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+      void *user_data, cl_int *errcode_ret);
+
+  cl_int clRetainContext(cl_context context);
+
+  cl_int clReleaseContext(cl_context context);
+
+  cl_int clGetContextInfo(cl_context context, cl_context_info param_name,
+                          size_t param_value_size, void *param_value,
+                          size_t *param_value_size_ret);
+
+  cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device,
+                                        cl_command_queue_properties properties,
+                                        cl_int *errcode_ret);
+
+  cl_int clRetainCommandQueue(cl_command_queue command_queue);
+
+  cl_int clReleaseCommandQueue(cl_command_queue command_queue);
+
+  cl_int clGetCommandQueueInfo(cl_command_queue command_queue,
+                               cl_command_queue_info param_name,
+                               size_t param_value_size, void *param_value,
+                               size_t *param_value_size_ret);
+
+  cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
+                        void *host_ptr, cl_int *errcode_ret);
+
+  cl_mem clCreateImage2D(cl_context context, cl_mem_flags flags,
+                         const cl_image_format *image_format,
+                         size_t image_width, size_t image_height,
+                         size_t image_row_pitch, void *host_ptr,
+                         cl_int *errcode_ret);
+
+  cl_mem clCreateImage3D(cl_context context, cl_mem_flags flags,
+                         const cl_image_format *image_format,
+                         size_t image_width, size_t image_height,
+                         size_t image_depth, size_t image_row_pitch,
+                         size_t image_slice_pitch, void *host_ptr,
+                         cl_int *errcode_ret);
+
+  cl_int clRetainMemObject(cl_mem memobj);
+
+  cl_int clReleaseMemObject(cl_mem memobj);
+
+  cl_int clGetSupportedImageFormats(cl_context context, cl_mem_flags flags,
+                                    cl_mem_object_type image_type,
+                                    cl_uint num_entries,
+                                    cl_image_format *image_formats,
+                                    cl_uint *num_image_formats);
+
+  cl_int clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name,
+                            size_t param_value_size, void *param_value,
+                            size_t *param_value_size_ret);
+
+  cl_int clGetImageInfo(cl_mem image, cl_image_info param_name,
+                        size_t param_value_size, void *param_value,
+                        size_t *param_value_size_ret);
+
+  cl_sampler clCreateSampler(cl_context context, cl_bool normalized_coords,
+                             cl_addressing_mode addressing_mode,
+                             cl_filter_mode filter_mode, cl_int *errcode_ret);
+
+  cl_int clRetainSampler(cl_sampler sampler);
+
+  cl_int clReleaseSampler(cl_sampler sampler);
+
+  cl_int clGetSamplerInfo(cl_sampler sampler, cl_sampler_info param_name,
+                          size_t param_value_size, void *param_value,
+                          size_t *param_value_size_ret);
+
+  cl_program clCreateProgramWithSource(cl_context context, cl_uint count,
+                                       const char **strings,
+                                       const size_t *lengths,
+                                       cl_int *errcode_ret);
+
+  cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices,
+                                       const cl_device_id *device_list,
+                                       const size_t *lengths,
+                                       const unsigned char **binaries,
+                                       cl_int *binary_status,
+                                       cl_int *errcode_ret);
+
+  cl_int clRetainProgram(cl_program program);
+
+  cl_int clReleaseProgram(cl_program program);
+
+  cl_int clBuildProgram(cl_program program, cl_uint num_devices,
+                        const cl_device_id *device_list, const char *options,
+                        void(CL_CALLBACK *pfn_notify)(cl_program program,
+                                                      void *user_data),
+                        void *user_data);
+
+  cl_int clCompileProgram(
+      cl_program program, cl_uint num_devices, const cl_device_id *device_list,
+      const char *options, cl_uint num_input_headers,
+      const cl_program *input_headers, const char **header_include_names,
+      void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+      void *user_data);
+
+  cl_program clLinkProgram(cl_context context, cl_uint num_devices,
+                           const cl_device_id *device_list, const char *options,
+                           cl_uint num_input_programs,
+                           const cl_program *input_programs,
+                           void(CL_CALLBACK *pfn_notify)(cl_program program,
+                                                         void *user_data),
+                           void *user_data, cl_int *errcode_ret);
+
+  cl_int clUnloadCompiler(void);
+
+  cl_int clUnloadPlatform(cl_platform_id);
+
+  cl_int clGetProgramInfo(cl_program program, cl_program_info param_name,
+                          size_t param_value_size, void *param_value,
+                          size_t *param_value_size_ret);
+
+  cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
+                               cl_program_build_info param_name,
+                               size_t param_value_size, void *param_value,
+                               size_t *param_value_size_ret);
+
+  cl_kernel clCreateKernel(cl_program program, const char *kernel_name,
+                           cl_int *errcode_ret);
+
+  cl_int clCreateKernelsInProgram(cl_program program, cl_uint num_kernels,
+                                  cl_kernel *kernels, cl_uint *num_kernels_ret);
+
+  cl_int clRetainKernel(cl_kernel kernel);
+
+  cl_int clReleaseKernel(cl_kernel kernel);
+
+  cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size,
+                        const void *arg_value);
+
+  cl_int clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name,
+                         size_t param_value_size, void *param_value,
+                         size_t *param_value_size_ret);
+
+  cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device,
+                                  cl_kernel_work_group_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret);
+
+  cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list);
+
+  cl_int clGetEventInfo(cl_event evnt, cl_event_info param_name,
+                        size_t param_value_size, void *param_value,
+                        size_t *param_value_size_ret);
+
+  cl_int clRetainEvent(cl_event evnt);
+
+  cl_int clReleaseEvent(cl_event evnt);
+
+  cl_int clGetEventProfilingInfo(cl_event evnt, cl_profiling_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret);
+
+  cl_int clFlush(cl_command_queue command_queue);
+
+  cl_int clFinish(cl_command_queue command_queue);
+
+  cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
+                             cl_bool blocking_read, size_t offset, size_t cb,
+                             void *ptr, cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
+                              cl_bool blocking_write, size_t offset, size_t cb,
+                              const void *ptr, cl_uint num_events_in_wait_list,
+                              const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer,
+                             cl_mem dst_buffer, size_t src_offset,
+                             size_t dst_offset, size_t cb,
+                             cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueReadImage(cl_command_queue command_queue, cl_mem image,
+                            cl_bool blocking_read, const size_t *origin,
+                            const size_t *region, size_t row_pitch,
+                            size_t slice_pitch, void *ptr,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image,
+                             cl_bool blocking_write, const size_t *origin,
+                             const size_t *region, size_t input_row_pitch,
+                             size_t input_slice_pitch, const void *ptr,
+                             cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image,
+                            cl_mem dst_image, const size_t *src_origin,
+                            const size_t *dst_origin, const size_t *region,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+                                    cl_mem src_image, cl_mem dst_buffer,
+                                    const size_t *src_origin,
+                                    const size_t *region, size_t dst_offset,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *evnt);
+
+  cl_int clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+                                    cl_mem src_buffer, cl_mem dst_image,
+                                    size_t src_offset, const size_t *dst_origin,
+                                    const size_t *region,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *evnt);
+
+  void *clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer,
+                           cl_bool blocking_map, cl_map_flags map_flags,
+                           size_t offset, size_t cb,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list, cl_event *evnt,
+                           cl_int *errcode_ret);
+
+  void *clEnqueueMapImage(cl_command_queue command_queue, cl_mem image,
+                          cl_bool blocking_map, cl_map_flags map_flags,
+                          const size_t *origin, const size_t *region,
+                          size_t *image_row_pitch, size_t *image_slice_pitch,
+                          cl_uint num_events_in_wait_list,
+                          const cl_event *event_wait_list, cl_event *evnt,
+                          cl_int *errcode_ret);
+
+  cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_mem memobj,
+                                 void *mapped_ptr,
+                                 cl_uint num_events_in_wait_list,
+                                 const cl_event *event_wait_list,
+                                 cl_event *evnt);
+
+  cl_int clEnqueueNDRangeKernel(
+      cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
+      const size_t *global_work_offset, const size_t *global_work_size,
+      const size_t *local_work_size, cl_uint num_events_in_wait_list,
+      const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel,
+                       cl_uint num_events_in_wait_list,
+                       const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueNativeKernel(cl_command_queue command_queue,
+                               void(CL_CALLBACK *user_func)(void *), void *args,
+                               size_t cb_args, cl_uint num_mem_objects,
+                               const cl_mem *mem_list,
+                               const void **args_mem_loc,
+                               cl_uint num_events_in_wait_list,
+                               const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *evnt);
+
+  cl_int clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+                                     cl_uint num_events_in_wait_list,
+                                     const cl_event *event_wait_list,
+                                     cl_event *evnt);
+
+  cl_int clEnqueueWaitForEvents(cl_command_queue command_queue,
+                                cl_uint num_events, const cl_event *event_list);
+
+  cl_int clEnqueueBarrier(cl_command_queue command_queue);
+
+  void *clGetExtensionFunctionAddress(const char *func_name);
+
+  cl_int clEnqueueReadBufferRect(
+      cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+      const size_t *buffer_origin, const size_t *host_origin,
+      const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+      size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
+      cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+      cl_event *evnt);
+
+  cl_int clEnqueueWriteBufferRect(
+      cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+      const size_t *buffer_origin, const size_t *host_origin,
+      const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+      size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
+      cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+      cl_event *evnt);
+
+  cl_int clEnqueueCopyBufferRect(
+      cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
+      const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+      size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
+      size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
+      const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_mem clCreateImage(cl_context context, cl_mem_flags flags,
+                       const cl_image_format *image_format,
+                       const cl_image_desc *image_desc, void *host_ptr,
+                       cl_int *errcode_ret);
+
+  cl_mem clCreateSubBuffer(cl_mem mem, cl_mem_flags flags,
+                           cl_buffer_create_type buffer_create_type,
+                           const void *buffer_create_info, cl_int *errcode_ret);
+
+  cl_int clSetEventCallback(
+      cl_event event, cl_int command_exec_callback_type,
+      void(CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                          cl_int event_command_exec_status,
+                                          void *user_data),
+      void *user_data);
+
+  cl_int clEnqueueFillImage(cl_command_queue command_queue, cl_mem image,
+                            void *ptr, const size_t *origin,
+                            const size_t *region,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clUnloadPlatformAMD(cl_platform_id id);
+
+  cl_int clEnqueueWaitSignalAMD(cl_command_queue command_queue,
+                                cl_mem mem_object, cl_uint value,
+                                cl_uint num_events,
+                                const cl_event *event_wait_list,
+                                cl_event *event);
+
+  cl_int clEnqueueWriteSignalAMD(cl_command_queue command_queue,
+                                 cl_mem mem_object, cl_uint value,
+                                 cl_ulong offset, cl_uint num_events,
+                                 const cl_event *event_list, cl_event *event);
+
+  cl_int clEnqueueMakeBuffersResidentAMD(
+      cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects,
+      cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses,
+      cl_uint num_events, const cl_event *event_list, cl_event *event);
+
+  cl_int clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                                    cl_uint num_mem_objects,
+                                    const cl_mem *mem_objects,
+                                    cl_mem_migration_flags flags,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event);
+
+  // CL-GL Extension: cl_khr_gl_sharing
+  cl_int clGetGLContextInfoKHR(const cl_context_properties *properties,
+                               cl_gl_context_info param_name,
+                               size_t param_value_size, void *param_value,
+                               size_t *param_value_size_ret);
+
+  cl_mem clCreateFromGLBuffer(cl_context context, cl_mem_flags flags,
+                              unsigned int bufobj, int *errcode_ret);
+
+  cl_mem clCreateFromGLTexture(cl_context context, cl_mem_flags flags,
+                               unsigned int texture_target, int miplevel,
+                               unsigned int texture, cl_int *errcode_ret);
+
+  cl_mem clCreateFromGLTexture2D(cl_context context, cl_mem_flags flags,
+                                 unsigned int texture_target, int miplevel,
+                                 unsigned int texture, cl_int *errcode_ret);
+
+  cl_mem clCreateFromGLRenderbuffer(cl_context context, cl_mem_flags flags,
+                                    unsigned int renderbuffer,
+                                    cl_int *errcode_ret);
+
+  cl_int clGetGLObjectInfo(cl_mem memobj, cl_gl_object_type *gl_object_type,
+                           unsigned int *gl_object_name);
+
+  cl_int clGetGLTextureInfo(cl_mem memobj, cl_gl_texture_info param_name,
+                            size_t param_value_size, void *param_value,
+                            size_t *param_value_size_ret);
+
+  cl_int clEnqueueAcquireGLObjects(cl_command_queue command_queue,
+                                   cl_uint num_objects,
+                                   const cl_mem *mem_objects,
+                                   cl_uint num_events_in_wait_list,
+                                   const cl_event *event_wait_list,
+                                   cl_event *event);
+
+  cl_int clEnqueueReleaseGLObjects(cl_command_queue command_queue,
+                                   cl_uint num_objects,
+                                   const cl_mem *mem_objects,
+                                   cl_uint num_events_in_wait_list,
+                                   const cl_event *event_wait_list,
+                                   cl_event *event);
+
+// The declarations below are compiled only when the OpenCL headers define
+// CL_VERSION_2_0; they use types such as cl_queue_properties,
+// cl_svm_mem_flags and cl_pipe_properties.
+#if defined(CL_VERSION_2_0)
+  // Command queue creation taking a property list instead of a bitfield.
+  cl_command_queue clCreateCommandQueueWithProperties(
+      cl_context context, cl_device_id device,
+      const cl_queue_properties *properties, cl_int *errcode_ret);
+
+  // SVM allocation: returns a raw pointer rather than a cl_mem.
+  void *clSVMAlloc(cl_context context, cl_svm_mem_flags flags, size_t size,
+                   cl_uint alignment);
+
+  // Counterpart of clSVMAlloc; takes the pointer to release.
+  void clSVMFree(cl_context context, void *svm_pointer);
+
+  // Enqueue calls operating on an SVM pointer, each with an event wait list.
+  cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map,
+                         cl_map_flags flags, void *svm_ptr, size_t size,
+                         cl_uint num_events_in_wait_list,
+                         const cl_event *event_wait_list, cl_event *event);
+
+  cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list, cl_event *event);
+
+  cl_int clEnqueueSVMMemFill(cl_command_queue command_queue, void *svm_ptr,
+                             const void *pattern, size_t pattern_size,
+                             size_t size, cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *event);
+
+  // Kernel argument setter that takes a pointer value directly (no size).
+  cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index,
+                                  const void *arg_value);
+
+  // Pipe creation and query.
+  cl_mem clCreatePipe(cl_context context, cl_mem_flags flags,
+                      cl_uint packet_size, cl_uint num_packets,
+                      const cl_pipe_properties *properties,
+                      cl_int *errcode_ret);
+
+  cl_int clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name,
+                       size_t param_value_size, void *param_value,
+                       size_t *param_value_size_ret);
+
+#endif
+
+  // --- AMD performance-counter and clock-mode declarations ----------------
+  // Each name below has a matching *_ptr member in the private section.
+  // NOTE(review): presumably these forward to those pointers - confirm in
+  // the implementation file.
+
+  // Returns a cl_perfcounter_amd handle for `device`.
+  cl_perfcounter_amd clCreatePerfCounterAMD(cl_device_id device,
+                                            cl_perfcounter_property *properties,
+                                            cl_int *errcode_ret);
+
+  // Begin/End take an array of `num_perf_counters` handles and an event
+  // wait list, like other enqueue calls.
+  cl_int clEnqueueBeginPerfCounterAMD(cl_command_queue command_queue,
+                                      cl_uint num_perf_counters,
+                                      cl_perfcounter_amd *perf_counters,
+                                      cl_uint num_events_in_wait_list,
+                                      const cl_event *event_wait_list,
+                                      cl_event *event);
+
+  cl_int clEnqueueEndPerfCounterAMD(cl_command_queue command_queue,
+                                    cl_uint num_perf_counters,
+                                    cl_perfcounter_amd *perf_counters,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event);
+
+  // Query with the usual (size, value, size_ret) parameter triple.
+  cl_int clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter,
+                                 cl_perfcounter_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret);
+
+  // Release/retain pair for a perf-counter handle.
+  cl_int clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter);
+
+  cl_int clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter);
+
+  // Takes the clock-mode request by value and an output struct by pointer.
+  cl_int clSetDeviceClockModeAMD(
+      cl_device_id device,
+      cl_set_device_clock_mode_input_amd set_clock_mode_input,
+      cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
+
+ private:
+  // Function-pointer members, one per extension entry point. The *_fn types
+  // are declared outside this part of the header.
+  // NOTE(review): where these pointers are assigned is not visible here.
+
+  // AMD signal / residency enqueue entry points
+  clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD_ptr;
+  clEnqueueWriteSignalAMD_fn clEnqueueWriteSignalAMD_ptr;
+  clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD_ptr;
+
+  // Unload the platform
+  clUnloadPlatformAMD_fn clUnloadPlatformAMD_ptr;
+
+  // CL-GL Extension: cl_khr_gl_sharing
+  clGetGLContextInfoKHR_fn clGetGLContextInfoKHR_ptr;
+  clCreateFromGLBuffer_fn clCreateFromGLBuffer_ptr;
+  clCreateFromGLTexture_fn clCreateFromGLTexture_ptr;
+  clCreateFromGLTexture2D_fn clCreateFromGLTexture2D_ptr;
+  clCreateFromGLRenderbuffer_fn clCreateFromGLRenderbuffer_ptr;
+  clGetGLObjectInfo_fn clGetGLObjectInfo_ptr;
+  clGetGLTextureInfo_fn clGetGLTextureInfo_ptr;
+  clEnqueueAcquireGLObjects_fn clEnqueueAcquireGLObjects_ptr;
+  clEnqueueReleaseGLObjects_fn clEnqueueReleaseGLObjects_ptr;
+
+  // Performance counters
+  clCreatePerfCounterAMD_fn clCreatePerfCounterAMD_ptr;
+  clEnqueueBeginPerfCounterAMD_fn clEnqueueBeginPerfCounterAMD_ptr;
+  clEnqueueEndPerfCounterAMD_fn clEnqueueEndPerfCounterAMD_ptr;
+  clGetPerfCounterInfoAMD_fn clGetPerfCounterInfoAMD_ptr;
+  clReleasePerfCounterAMD_fn clReleasePerfCounterAMD_ptr;
+  clRetainPerfCounterAMD_fn clRetainPerfCounterAMD_ptr;
+  // Set clockMode
+  clSetDeviceClockModeAMD_fn clSetDeviceClockModeAMD_ptr;
+};
+
+#endif
@@ -0,0 +1,104 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "oclTestLog.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "OCLLog.h"
+
+// Starts in console-only mode: messages go to stdout, no file name is set
+// and mirroring to a file is switched off.
+oclLog::oclLog()
+    : m_stdout_fp(stdout), m_filename(), m_writeToFileIsEnabled(false) {
+}
+
+// Switches file mirroring off on destruction. No file handle is kept open
+// between messages, so there is nothing else to release here.
+oclLog::~oclLog() {
+  disable_write_to_file();
+}
+
+// Turns on mirroring of log output to `filename`.
+//
+// The file is created (or truncated) here and closed again immediately;
+// vprint() reopens it in append mode for every message. If the file cannot
+// be opened, an error is logged and file mirroring stays disabled.
+void oclLog::enable_write_to_file(std::string filename) {
+  m_filename = filename;
+  FILE* fp = fopen(m_filename.c_str(), "w");
+  if (fp == NULL) {
+    // Make sure mirroring is off *before* reporting the failure. When this
+    // object is the global log, oclTestLog() calls back into vprint(), which
+    // must not try to append to the file that just failed to open.
+    m_writeToFileIsEnabled = false;
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "ERROR: Cannot open file %s. Disabling logging to file.\n",
+               filename.c_str());
+    return;
+  }
+  fclose(fp);
+  // Only enable mirroring once the file is known to be writable.
+  m_writeToFileIsEnabled = true;
+}
+
+// Stops mirroring messages to the log file. The stored file name is kept.
+void oclLog::disable_write_to_file() {
+  m_writeToFileIsEnabled = false;
+}
+
+// Formats `fmt` with `args` and writes the result to the console stream and,
+// when file mirroring is enabled, appends it to the log file.
+//
+// The message is formatted into a fixed 4096-byte stack buffer; vsnprintf
+// truncates anything longer. The log file is opened and closed per message.
+void oclLog::vprint(char const* fmt, va_list args) {
+  // Fixed-size scratch buffer (historical workaround for a 64-bit Linux
+  // segfault / garbage output in the file). 4096 is an arbitrary limit.
+  char buffer[4096];
+
+  memset(buffer, 0, sizeof(buffer));
+  int rc = vsnprintf(buffer, sizeof(buffer), fmt, args);
+  assert(rc >= 0 && rc != sizeof(buffer));
+  (void)rc;  // only inspected by the assert above
+
+  fputs(buffer, m_stdout_fp);
+  if (m_writeToFileIsEnabled) {
+    FILE* fp = fopen(m_filename.c_str(), "a");
+    if (fp == NULL) {
+      // Disable mirroring first: oclTestLog() re-enters vprint() on the
+      // global log, and with the flag still set it would recurse forever.
+      m_writeToFileIsEnabled = false;
+      oclTestLog(OCLTEST_LOG_ALWAYS,
+                 "ERROR: Cannot open file %s. Disabling logging to file.\n",
+                 m_filename.c_str());
+      // Nothing to write to; never pass a NULL FILE* to fputs/fclose.
+      return;
+    }
+    fputs(buffer, fp);
+    fclose(fp);
+  }
+}
+
+// Flushes the console stream. The log file needs no flush because it is
+// closed after every message.
+void oclLog::flush() {
+  fflush(m_stdout_fp);
+}
+
+// Returns the single logger shared by the oclTest* free functions below.
+// The instance is a function-local static, created on first call.
+static oclLog& theLog() {
+  static oclLog instance;
+  return instance;
+}
+
+// Verbosity threshold: oclTestLog() prints a message only when its level is
+// less than or equal to this value. Changed through oclTestSetLogLevel().
+static oclLoggingLevel currentLevel = OCLTEST_LOG_ALWAYS;
+// Incremented once per oclTestLog() call; not read anywhere in this file.
+static float logcount = 0.0f;
+
+// printf-style logging entry point.
+//
+// Every call bumps the call counter. The message is formatted, printed and
+// flushed through the shared logger only when `logLevel` does not exceed the
+// current verbosity threshold.
+void oclTestLog(oclLoggingLevel logLevel, const char* fmt, ...) {
+  logcount += 1.0f;
+
+  // Filtered out by the verbosity threshold: nothing more to do.
+  if (logLevel > currentLevel) {
+    return;
+  }
+
+  va_list varArgs;
+  va_start(varArgs, fmt);
+
+  oclLog& log = theLog();
+  log.vprint(fmt, varArgs);
+  log.flush();
+
+  va_end(varArgs);
+}
+
+// Asks the shared logger to mirror all subsequent messages to `filename`.
+void oclTestEnableLogToFile(const char* filename) {
+  oclLog& log = theLog();
+  log.enable_write_to_file(std::string(filename));
+}
+
+// Sets the verbosity threshold used by oclTestLog(). Negative values are
+// ignored and leave the current threshold unchanged.
+void oclTestSetLogLevel(int level) {
+  if (level < 0) {
+    return;
+  }
+  currentLevel = static_cast<oclLoggingLevel>(level);
+}
@@ -0,0 +1,44 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef CALTESTLOG_H_
+#define CALTESTLOG_H_
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include <string>
+
+// Minimal logger: formats messages and writes them to a console stream,
+// optionally mirroring them to a named file. All operations are virtual so
+// a subclass can replace them.
+class oclLog {
+ public:
+  oclLog();
+  virtual ~oclLog();
+  // Formats `fmt` with the already-started `args` list and emits the text.
+  virtual void vprint(char const* fmt, va_list args);
+  // Flushes the console stream.
+  virtual void flush();
+  // Starts mirroring output to `filename` (taken by value).
+  virtual void enable_write_to_file(std::string filename);
+  // Stops mirroring output to the file.
+  virtual void disable_write_to_file();
+
+ private:
+  FILE* m_stdout_fp;            // console stream messages are written to
+  std::string m_filename;       // path of the mirror file
+  bool m_writeToFileIsEnabled;  // true while mirroring to m_filename is on
+};
+
+#endif  // CALTESTLOG_H_
@@ -0,0 +1,185 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "BaseTestImp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstring>
+
+/////////////////////////////////////////////////////////////////////////////
+
+static unsigned int crcinit(unsigned int crc);
+static int initializeSeed(void);
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Builds the CRC lookup table and puts the test object into its initial
+// state (no error, device 0, platform 0, no perf info).
+BaseTestImp::BaseTestImp()
+    : _numSubTests(0), _openTest(0), _deviceName(NULL), _architecture(0) {
+  // Pre-compute the 256-entry CRC table; each entry is derived from the
+  // table index placed in the top byte of a 32-bit word.
+  for (unsigned int index = 0; index < 256; ++index) {
+    _crctab[index] = crcinit(index << 24);
+  }
+  _crcword = ~0;
+
+  _cpu = false;
+  _deviceId = 0;
+  _platformIndex = 0;
+  _perfInfo = 0.0f;
+
+  // Thread usage flag: off when ATI_OS_LINUX is defined, on otherwise.
+#ifdef ATI_OS_LINUX
+  _useThreads = 0;
+#else
+  _useThreads = 1;
+#endif
+
+  clearError();
+}
+
+// Resets the OpenCL members, selects the platform at _platformIndex,
+// enumerates its GPU devices and marks the test as failed (failed_ = true)
+// when the selected device's driver version string contains "LC".
+//
+// The three parameters are not used by this function.
+// CHECK_RESULT is defined outside this file; it is invoked the same way
+// throughout: first argument is the failure condition.
+void BaseTestImp::checkComplib(unsigned int test, const char *deviceName,
+                               unsigned int architecture) {
+  BaseTestImp::open();
+  devices_ = 0;
+  deviceCount_ = 0;
+  context_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  type_ = CL_DEVICE_TYPE_GPU;
+
+  cl_uint numPlatforms = 0;
+  error_ = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
+  CHECK_RESULT((numPlatforms == 0), "No platform found");
+
+  cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+  error_ = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+  cl_platform_id platform = 0;
+#if 0
+  for(unsigned int i = 0; i < numPlatforms; ++i)
+  {
+    char buff[200];
+    error_ = clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(buff), buff, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+    if(strcmp(buff, "Advanced Micro Devices, Inc.") == 0)
+    {
+      platform = platforms[i];
+      break;
+    }
+  }
+#endif
+  // Guard the index: an out-of-range _platformIndex leaves `platform` null
+  // and is reported by the check below instead of reading past the array.
+  if (_platformIndex < numPlatforms) {
+    platform = platforms[_platformIndex];
+  }
+
+  delete[] platforms;
+
+  CHECK_RESULT((platform == 0), "AMD Platform not found");
+
+  error_ = clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  devices_ = new cl_device_id[deviceCount_];
+  error_ = clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  // Do not index devices_ with an id the platform did not report.
+  CHECK_RESULT((_deviceId >= deviceCount_), "Invalid device index");
+
+  // Zero-initialised so strstr() sees an empty string if the query fails.
+  char device_string[200] = {0};
+  clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, sizeof(device_string),
+                  device_string, NULL);
+  if (strstr(device_string, "LC")) {
+    printf("Skipping test since it does not run with LC\n");
+    failed_ = true;
+  }
+}
+
+// Empty destructor: nothing is released here.
+// NOTE(review): devices_ is allocated with new[] in checkComplib(); its
+// release is not visible in this file - confirm who owns it.
+BaseTestImp::~BaseTestImp() {}
+
+void BaseTestImp::open() {
+  _crcword = 0;
+  clearError();
+}
+// Three-argument overload: ignores `test`, `deviceName` and `architecture`
+// and simply delegates to the parameterless open().
+void BaseTestImp::open(unsigned int test, const char *deviceName,
+                       unsigned int architecture) {
+  open();
+}
+
+// Ends a run and returns the current CRC word.
+unsigned int BaseTestImp::close() { return _crcword; }
+
+// Returns the thread-usage flag set in the constructor.
+unsigned int BaseTestImp::getThreadUsage(void) { return _useThreads; }
+
+// Returns the number of sub-tests.
+int BaseTestImp::getNumSubTests(void) { return _numSubTests; }
+
+// Stores the pointer only; the string is not copied, so the caller must
+// keep it alive for as long as getDeviceName() may be used.
+void BaseTestImp::setDeviceName(const char *name) { _deviceName = name; }
+
+// Returns the pointer last given to setDeviceName() (NULL initially).
+const char *BaseTestImp::getDeviceName() { return _deviceName; }
+
+// Returns the stored performance figure.
+float BaseTestImp::getPerfInfo(void) { return _perfInfo; }
+
+// Resets the stored performance figure to zero.
+void BaseTestImp::clearPerfInfo(void) { _perfInfo = 0.0; }
+
+// Selects the device index used by later calls (not range-checked here).
+void BaseTestImp::setDeviceId(unsigned int deviceId) { _deviceId = deviceId; }
+
+// Stores the requested iteration count.
+void BaseTestImp::setIterationCount(int cnt) { _iterationCnt = cnt; }
+
+// Returns the selected device index.
+unsigned int BaseTestImp::getDeviceId() { return _deviceId; }
+
+// Selects the platform index used by checkComplib() (not range-checked
+// here).
+void BaseTestImp::setPlatformIndex(unsigned int platformIndex) {
+  _platformIndex = platformIndex;
+}
+
+// Returns the selected platform index.
+unsigned int BaseTestImp::getPlatformIndex() { return _platformIndex; }
+
+void BaseTestImp::setErrorMsg(const char *error) {
+  _errorFlag = true;
+  _errorMsg.assign((const char *)error);
+}
+
+// Returns the stored error message. The pointer refers to the internal
+// string buffer, so it is only valid until the message is next changed.
+const char *BaseTestImp::getErrorMsg() { return _errorMsg.c_str(); }
+
+// Returns true once setErrorMsg() has been called and until clearError().
+bool BaseTestImp::hasErrorOccured() { return _errorFlag; }
+
+void BaseTestImp::clearError() {
+  _errorFlag = false;
+  _errorMsg.clear();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Same CRC32 as used by ogtst
+//
+static const unsigned int CRCMASK = 0x04c11db7;
+
+// Computes one CRC table entry: shifts `crc` left eight times and XORs in
+// CRCMASK whenever the bit shifted out of position 31 was set.
+static unsigned int crcinit(unsigned int crc) {
+  unsigned int value = crc;
+
+  for (int bit = 0; bit < 8; ++bit) {
+    const bool topBitSet = (value & 0x80000000) != 0;
+    value <<= 1;
+    if (topBitSet) {
+      value ^= CRCMASK;
+    }
+  }
+  return value;
+}
@@ -0,0 +1,175 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLCommon.h"
+
+#include <cmath>
+#include <cstring>
+
+// Opens the test: runs the base-class OpenCL setup, verifies the selected
+// device advertises cl_khr_gl_sharing, creates the GL context and then
+// replaces the CL context with one that shares with it.
+//
+// On an invalid device index or a missing extension, _errorFlag is set and
+// the function returns early.
+void OCLGLCommon::open(unsigned int test, char *units, double &conversion,
+                       unsigned int deviceId) {
+  // OpenCL Initialization
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_);
+
+  if (deviceId >= deviceCount_) {
+    _errorFlag = true;
+    return;
+  }
+
+  // Check that the device supports CL/GL interop extension.
+  // The extension string has no fixed upper bound, so ask for its size
+  // first instead of reading into a fixed 1024-byte buffer (a too-small
+  // buffer makes the query fail and the extension look missing).
+  bool hasGLSharing = false;
+  size_t size = 0;
+  cl_int status = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_EXTENSIONS, 0, NULL, &size);
+  if (status == CL_SUCCESS && size > 0) {
+    char *extensions = (char *)malloc(size + 1);
+    if (extensions != NULL) {
+      // Zero-fill so the buffer is a valid C string whatever is returned.
+      memset(extensions, 0, size + 1);
+      status = _wrapper->clGetDeviceInfo(
+          devices_[deviceId], CL_DEVICE_EXTENSIONS, size, extensions, NULL);
+      hasGLSharing = (status == CL_SUCCESS) &&
+                     (strstr(extensions, "cl_khr_gl_sharing") != NULL);
+      free(extensions);
+    }
+  }
+  if (!hasGLSharing) {
+    printf("KHR GL sharing extension is required for this test!\n");
+    _errorFlag = true;
+    return;
+  }
+
+  // OpenGL Initialization
+  // initializeGLContext() returns false on failure.
+  bool retVal = initializeGLContext(hGL_);
+  CHECK_RESULT((!retVal), "Error opening test (%d)", error_);
+
+  createCLContextFromGLContext(hGL_);
+}
+
+// Probes whether a GL context can be set up for the given device.
+// Runs the base-class open, tries to initialise the GL context, tears it
+// down again if that worked, closes the base class and reports the result.
+bool OCLGLCommon::IsGLEnabled(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+
+  const bool glAvailable = initializeGLContext(hGL_);
+  if (glAvailable) {
+    // The probe succeeded; release what it created.
+    deleteGLContext(hGL_);
+  }
+
+  OCLTestImp::close();
+  return glAvailable;
+}
+
+// Sets up a perspective projection through glFrustum.
+//   fovy   - vertical field of view, in degrees
+//   aspect - width / height ratio of the view
+//   zNear, zFar - distances to the near and far clipping planes
+// The near-plane half height is zNear * tan(fovy / 2), with fovy converted
+// from degrees to radians (fovy * pi / 360).
+void OCLGLCommon::gluPerspective(double fovy, double aspect, double zNear,
+                                 double zFar) {
+  // Local constant: M_PI is not guaranteed by the C++ standard headers.
+  const double kPi = 3.14159265358979323846;
+
+  const double ymax = zNear * tan(fovy * kPi / 360.0);
+  const double ymin = -ymax;
+  const double xmin = ymin * aspect;
+  const double xmax = ymax * aspect;
+  glFrustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+// Closes the test. The GL context is made current before the base-class
+// close runs and is deleted only afterwards; the base-class result is
+// returned unchanged.
+unsigned int OCLGLCommon::close(void) {
+  makeCurrent(hGL_);
+  const unsigned int baseResult = OCLTestImp::close();
+  deleteGLContext(hGL_);
+  return baseResult;
+}
+
+// Writes a dimSize x dimSize block of floats to `fileName` as text, one
+// row per line, each value in %e format followed by ",\t".
+// Does nothing when `pBuffer` is null or the file cannot be opened.
+void OCLGLCommon::dumpBuffer(float *pBuffer, const char fileName[],
+                             unsigned int dimSize) {
+  if (!pBuffer) {
+    return;
+  }
+
+  FILE *out = fopen(fileName, "w");
+  if (NULL == out) {
+    return;
+  }
+
+  for (unsigned int row = 0; row < dimSize; ++row) {
+    for (unsigned int col = 0; col < dimSize; ++col) {
+      fprintf(out, "%e,\t", pBuffer[row * (dimSize) + col]);
+    }
+    fprintf(out, "\n");
+  }
+  fclose(out);
+}
+
+// Compiles `source` as a fragment shader, attaches it to a new program and
+// links the program. The shader and program names are returned through the
+// reference parameters, and both info logs are printed.
+// Returns true when glCreateProgram produced a non-zero program name.
+// NOTE(review): compile and link status are not queried here, so a true
+// return does not by itself mean the shader compiled or linked.
+bool OCLGLCommon::createGLFragmentProgramFromSource(const char *source,
+                                                    GLuint &shader,
+                                                    GLuint &program) {
+  shader = glCreateShader(GL_FRAGMENT_SHADER);
+  // Single source string; NULL length means it is null-terminated.
+  glShaderSource(shader, 1, &source, NULL);
+  glCompileShader(shader);
+  printShaderInfoLog(shader);
+  program = glCreateProgram();
+  glAttachShader(program, shader);
+  glLinkProgram(program);
+  printProgramInfoLog(program);
+
+  return program != 0;
+}
+
+int OCLGLCommon::printOglError(char *file, int line) {
+  //
+  // Returns 1 if an OpenGL error occurred, 0 otherwise.
+  //
+  GLenum glErr;
+  int retCode = 0;
+
+  glErr = glGetError();
+  if (glErr != GL_NO_ERROR) {
+    printf("glError in file %s @ line %d: %d\n", file, line, glErr);
+    retCode = 1;
+  }
+  return retCode;
+}
+
+//
+// Prints the information log of a shader object, if it has one.
+// An empty log prints nothing; an allocation failure prints an error
+// message and returns.
+//
+void OCLGLCommon::printShaderInfoLog(GLuint shader) {
+  int logLength = 0;
+  glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &logLength);
+  if (logLength <= 0) {
+    return;
+  }
+
+  GLchar *logText = (GLchar *)malloc(logLength);
+  if (logText == NULL) {
+    printf("ERROR: Could not allocate InfoLog buffer\n");
+    return;
+  }
+
+  int written = 0;
+  glGetShaderInfoLog(shader, logLength, &written, logText);
+  printf("Shader InfoLog:\n%s\n\n", logText);
+  free(logText);
+}
+
+// Prints the information log of a program object, if it has one.
+// An empty log prints nothing. If the log buffer cannot be allocated, an
+// error message is printed and the function returns, matching
+// printShaderInfoLog(); a diagnostic helper should not end the process.
+void OCLGLCommon::printProgramInfoLog(GLuint program) {
+  int infologLength = 0;
+  int charsWritten = 0;
+  GLchar *infoLog;
+
+  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infologLength);
+
+  if (infologLength > 0) {
+    infoLog = (GLchar *)malloc(infologLength);
+    if (infoLog == NULL) {
+      printf("ERROR: Could not allocate InfoLog buffer\n");
+      return;
+    }
+    glGetProgramInfoLog(program, infologLength, &charsWritten, infoLog);
+    printf("Program InfoLog:\n%s\n\n", infoLog);
+    free(infoLog);
+  }
+}
@@ -0,0 +1,80 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_COMMON_H_
+#define _OCL_GL_COMMON_H_
+
+#include <GL/glew.h>
+#include <GL/gl.h>
+#include <GL/glx.h>
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+
+#include "OCLTestImp.h"
+
+typedef struct OCLGLHandle_* OCLGLHandle;
+
+#define printOpenGLError() OCLGLCommon::printOglError(__FILE__, __LINE__)
+
+// Base class for CL/GL interop tests. Extends OCLTestImp with a GL context
+// handle (hGL_) and helpers to create, bind and destroy that context; the
+// platform-specific parts are defined in separate source files that each
+// provide their own OCLGLHandle_ layout.
+class OCLGLCommon : public OCLTestImp {
+ public:
+  ////////////////////////////////////
+  // initialization and clean-up    //
+  ////////////////////////////////////
+  OCLGLCommon();
+  virtual ~OCLGLCommon();
+  ///////////////////////
+  // virtual interface //
+  ///////////////////////
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual unsigned int close(void);
+  // Static GL utilities; usable without an OCLGLCommon instance.
+  static void gluPerspective(double fovy, double aspect, double zNear,
+                             double zFar);
+  static void dumpBuffer(float* pBuffer, const char fileName[],
+                         unsigned int dimSize);
+  // Normally reached through the printOpenGLError() macro, which passes
+  // __FILE__ and __LINE__.
+  static int printOglError(char* file, int line);
+  static bool createGLFragmentProgramFromSource(const char* source,
+                                                GLuint& shader,
+                                                GLuint& program);
+  static void printShaderInfoLog(GLuint shader);
+  static void printProgramInfoLog(GLuint program);
+
+ protected:
+  // Returns the handle owned by this object.
+  const OCLGLHandle getGLHandle() { return hGL_; }
+  void makeCurrent(const OCLGLHandle hGL);
+  // Fills a caller-provided array of 7 entries (6 values + terminating 0).
+  void getCLContextPropertiesFromGLContext(const OCLGLHandle hGL,
+                                           cl_context_properties properties[7]);
+  // Allocate / free an additional handle supplied by the caller.
+  bool createGLContext(OCLGLHandle& hGL);
+  void destroyGLContext(OCLGLHandle& hGL);
+  bool IsGLEnabled(unsigned int test, char* units, double& conversion,
+                   unsigned int deviceId);
+
+ private:
+  bool initializeGLContext(OCLGLHandle& hGL);
+  void deleteGLContext(OCLGLHandle& hGL);
+  bool checkAssociationDeviceWithGLContext(OCLGLHandle& hGL);
+  void createCLContextFromGLContext(OCLGLHandle& hGL);
+
+  // GL context handle allocated in the constructor.
+  OCLGLHandle hGL_;
+};
+
+#endif  // _OCL_GL_COMMON_H_
@@ -0,0 +1,239 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLCommon.h"
+
+// GLX flavour of the GL handle. The X display, the chosen visual and the
+// reference count are static, i.e. shared by every handle; the context,
+// window and colormap belong to the individual handle.
+struct OCLGLHandle_ {
+  static Display* display;     // shared X connection, opened on first use
+  static XVisualInfo* vInfo;   // shared visual chosen by glXChooseVisual
+  static int referenceCount;   // incremented once per initializeGLContext
+  GLXContext context;
+  Window window;
+  Colormap cmap;
+};
+
+// Definitions of the shared (static) handle state; all start out empty.
+Display* OCLGLHandle_::display = NULL;
+XVisualInfo* OCLGLHandle_::vInfo = NULL;
+int OCLGLHandle_::referenceCount = 0;
+
+// Allocates the GL handle with no context, window or colormap yet; those
+// are created later by initializeGLContext().
+OCLGLCommon::OCLGLCommon() {
+  OCLGLHandle handle = new OCLGLHandle_;
+
+  handle->context = NULL;
+  handle->window = 0;
+  handle->cmap = 0;
+
+  hGL_ = handle;
+}
+
+// Tears down the GL context (if any) and frees the handle itself.
+OCLGLCommon::~OCLGLCommon() {
+  destroyGLContext(hGL_);
+}
+
+// Releases the GL resources held by `hGL`, frees the handle and resets the
+// caller's pointer to NULL. Calling it with a NULL handle (for example a
+// second time on the same variable) is a no-op.
+void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) {
+  if (hGL == NULL) {
+    return;
+  }
+  deleteGLContext(hGL);
+  delete hGL;
+  hGL = NULL;
+}
+
+// Releases the per-handle GLX resources (colormap, window, context) and
+// drops one reference on the shared display. When the count reaches zero
+// the display is closed and the shared visual is freed. Does nothing if no
+// display is open. The handle itself is not freed.
+// NOTE(review): the count is decremented on every call with an open
+// display, whether or not this handle incremented it. initializeGLContext()
+// calls this on association failure and the destructor calls it again, so
+// the count can drop twice for one handle - confirm this is intended.
+void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) {
+  if (hGL->display != NULL) {
+    // Unbind any current context before destroying resources.
+    glXMakeCurrent(hGL->display, None, NULL);
+    if (hGL->cmap) {
+      XFreeColormap(hGL->display, hGL->cmap);
+      hGL->cmap = 0;
+    }
+    if (hGL->window) {
+      XDestroyWindow(hGL->display, hGL->window);
+      hGL->window = 0;
+    }
+    if (hGL->context) {
+      glXDestroyContext(hGL->display, hGL->context);
+      hGL->context = NULL;
+    }
+
+    hGL->referenceCount--;
+    if (hGL->referenceCount == 0) {
+      // Last user: release the shared display and visual.
+      XCloseDisplay(hGL->display);
+      hGL->display = NULL;
+
+      XFree(hGL->vInfo);
+      hGL->vInfo = NULL;
+    }
+  }
+}
+
+// Allocates a new handle into `hGL` and initialises a GL context for it.
+// Returns the result of initializeGLContext(). The handle is allocated
+// even on failure; the caller releases it with destroyGLContext().
+bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) {
+  hGL = new OCLGLHandle_;
+  // Start from a clean state, as the constructor does for hGL_, so that a
+  // failed initialisation followed by deleteGLContext() never frees
+  // uninitialised context/window/colormap values.
+  hGL->context = NULL;
+  hGL->window = 0;
+  hGL->cmap = 0;
+  return initializeGLContext(hGL);
+}
+
+// Creates a GLX context plus a 640x480 window for `hGL`, makes the context
+// current and verifies that the selected CL device can share with it.
+// The display and visual are shared across handles and created only once.
+// Returns false on any failure. Only the association failure cleans up
+// here; the other failure paths leave whatever was created in the handle.
+bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) {
+  // Open the shared X connection on first use (NULL = default display).
+  if (hGL->display == NULL) {
+    hGL->display = XOpenDisplay(NULL);
+    if (hGL->display == NULL) {
+      printf("XOpenDisplay() failed\n");
+      return false;
+    }
+  }
+  // Choose the shared visual on first use: RGBA, double-buffered, at least
+  // 1 bit per colour channel and a 12-bit depth buffer.
+  if (hGL->vInfo == NULL) {
+    int dblBuf[] = {GLX_RGBA, GLX_RED_SIZE,     1,   GLX_GREEN_SIZE,
+                    1,        GLX_BLUE_SIZE,    1,   GLX_DEPTH_SIZE,
+                    12,       GLX_DOUBLEBUFFER, None};
+
+    hGL->vInfo =
+        glXChooseVisual(hGL->display, DefaultScreen(hGL->display), dblBuf);
+    if (hGL->vInfo == NULL) {
+      printf("glXChooseVisual() failed\n");
+      return false;
+    }
+  }
+  // Counted before the context exists; see deleteGLContext() for the
+  // matching decrement.
+  hGL->referenceCount++;
+
+  // No share list (None), direct rendering requested (True).
+  hGL->context = glXCreateContext(hGL->display, hGL->vInfo, None, True);
+  if (hGL->context == NULL) {
+    printf("glXCreateContext() failed\n");
+    return false;
+  }
+
+  // Window attributes are zeroed; only the colormap is set explicitly, so
+  // the border pixel and event mask selected below are both 0.
+  XSetWindowAttributes swa = {0};
+  hGL->cmap = XCreateColormap(hGL->display,
+                              RootWindow(hGL->display, hGL->vInfo->screen),
+                              hGL->vInfo->visual, AllocNone);
+  swa.colormap = hGL->cmap;
+  hGL->window = XCreateWindow(
+      hGL->display, RootWindow(hGL->display, hGL->vInfo->screen), 0, 0, 640,
+      480, 0, hGL->vInfo->depth, InputOutput, hGL->vInfo->visual,
+      CWBorderPixel | CWColormap | CWEventMask, &swa);
+
+  Bool glErr = glXMakeCurrent(hGL->display, hGL->window, hGL->context);
+  if (False == glErr) {
+    return false;
+  }
+
+  // The CL device must be among the devices associated with this context.
+  if (!checkAssociationDeviceWithGLContext(hGL)) {
+    deleteGLContext(hGL);
+    return false;
+  }
+  return true;
+}
+
+// Returns true when the currently selected CL device (devices_[_deviceId])
+// is one of the devices that clGetGLContextInfoKHR reports for the GL
+// context in `hGL`. Query failures are printed and reported as false;
+// error_ holds the status of the last query.
+bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) {
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->context,
+                                        CL_GLX_DISPLAY_KHR,
+                                        (cl_context_properties)hGL->display,
+                                        0};
+
+  // First query: size in bytes of the device list.
+  size_t devicesSize = 0;
+  error_ = _wrapper->clGetGLContextInfoKHR(
+      properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &devicesSize);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    return false;
+  }
+
+  // Second query: the device list itself.
+  cl_device_id* interopDevices = (cl_device_id*)malloc(devicesSize);
+  error_ =
+      _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR,
+                                      devicesSize, interopDevices, NULL);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    free(interopDevices);
+    return false;
+  }
+
+  // Look for the current device among the interop-capable ones.
+  const cl_uint numDevices = (cl_uint)devicesSize / sizeof(cl_device_id);
+  bool found = false;
+  for (cl_uint idx = 0; idx < numDevices && !found; ++idx) {
+    found = (interopDevices[idx] == devices_[_deviceId]);
+  }
+
+  free(interopDevices);
+  return found;
+}
+
+// Replaces the current CL context and command queue for the selected
+// device with ones created against the GL context in `hGL`, then
+// initialises GLEW.
+// Release failures are reported without returning (CHECK_RESULT_NO_RETURN);
+// the creation steps and glewInit use CHECK_RESULT.
+void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) {
+  // Property list tying the new CL context to the GLX context and display.
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->context,
+                                        CL_GLX_DISPLAY_KHR,
+                                        (cl_context_properties)hGL->display,
+                                        0};
+
+  // Release current command queue
+  if (cmdQueues_[_deviceId]) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  // Release current context
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
+  }
+
+  // Create new CL context from GL context
+  // NOTE(review): this calls clCreateContext directly, while the
+  // neighbouring calls go through _wrapper - confirm this is intentional.
+  context_ =
+      clCreateContext(properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_);
+
+  // Create command queue for new context
+  cmdQueues_[_deviceId] =
+      _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+               error_);
+
+  // GLEW versions 1.13.0 and earlier do not fetch all GL function pointers
+  // without glewExperimental set.
+  glewExperimental = GL_TRUE;
+  GLenum glErr = glewInit();
+  CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed: %s",
+               glewGetErrorString(glErr));
+}
+
+// Binds the GL context of `hGL` to the calling thread. Passing NULL unbinds
+// the current context instead, using this object's own display, provided
+// hGL_ is set. A failed bind is caught by an assert.
+void OCLGLCommon::makeCurrent(OCLGLHandle hGL) {
+  if (hGL != NULL) {
+    bool ret = glXMakeCurrent(hGL->display, hGL->window, hGL->context);
+    assert(ret && "glXMakeCurrent failed!");
+    return;
+  }
+
+  // NULL handle: release whatever context is current.
+  if (hGL_ != NULL) {
+    glXMakeCurrent(hGL_->display, None, NULL);
+  }
+}
+
+// Fills `properties` (7 entries) with the zero-terminated list needed to
+// create a CL context that shares with the GLX context in `hGL`:
+// platform, GL context and X display. Does nothing if `properties` is null.
+void OCLGLCommon::getCLContextPropertiesFromGLContext(
+    const OCLGLHandle hGL, cl_context_properties properties[7]) {
+  if (!properties) return;
+
+  const cl_context_properties values[7] = {
+      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_,
+      CL_GL_CONTEXT_KHR,   (cl_context_properties)hGL->context,
+      CL_GLX_DISPLAY_KHR,  (cl_context_properties)hGL->display,
+      0};
+
+  for (int idx = 0; idx < 7; ++idx) {
+    properties[idx] = values[idx];
+  }
+}
@@ -0,0 +1,239 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLCommon.h"
+
+// Windows (WGL) data behind the opaque OCLGLHandle.
+struct OCLGLHandle_ {
+  HDC hdc;      // device context the GL rendering context is created on
+  HGLRC hglrc;  // GL rendering context made current on hdc
+
+  // Start with both handles cleared: deleteGLContext() tests them, and
+  // initializeGLContext() can return before assigning either one.
+  OCLGLHandle_() : hdc(NULL), hglrc(NULL) {}
+};
+
+// Allocates this object's own GL handle with no device context or rendering
+// context attached yet.
+OCLGLCommon::OCLGLCommon() {
+  OCLGLHandle_* fresh = new OCLGLHandle_;
+  fresh->hglrc = NULL;
+  fresh->hdc = NULL;
+
+  hGL_ = fresh;
+}
+
+// Tears down the GL handle owned by this object.
+OCLGLCommon::~OCLGLCommon() {
+  destroyGLContext(hGL_);
+}
+
+// Releases the WGL resources behind hGL, frees the handle object itself and
+// leaves the caller's reference set to NULL.
+void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) {
+  deleteGLContext(hGL);
+
+  OCLGLHandle doomed = hGL;
+  hGL = NULL;
+  delete doomed;
+}
+
+// Calls wglMakeCurrent(NULL, NULL), then deletes the rendering context and
+// the device context stored in hGL, clearing each field once it is released.
+// The handle object itself is kept so it can be initialised again.
+//
+// A NULL handle is tolerated: destroyGLContext() resets the caller's
+// reference to NULL, so a second teardown (for example from the destructor)
+// would otherwise dereference a null pointer here.
+void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) {
+  wglMakeCurrent(NULL, NULL);
+  if (hGL == NULL) {
+    return;
+  }
+  if (hGL->hglrc) {
+    wglDeleteContext(hGL->hglrc);
+    hGL->hglrc = NULL;
+  }
+  if (hGL->hdc) {
+    DeleteDC(hGL->hdc);
+    hGL->hdc = NULL;
+  }
+}
+
+// Allocates a new GL handle into hGL and tries to initialise a WGL context
+// on it. Returns the result of initializeGLContext(); the handle stays
+// allocated either way.
+bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) {
+  hGL = new OCLGLHandle_;
+  // initializeGLContext() can return false before it assigns either field
+  // (e.g. no display device enumerated). Clear them first so a later
+  // deleteGLContext() never tests indeterminate handles.
+  hGL->hdc = NULL;
+  hGL->hglrc = NULL;
+  return initializeGLContext(hGL);
+}
+
+// Creates a WGL rendering context in hGL and makes it current.
+//
+// Display devices are enumerated in order; mirroring drivers and devices for
+// which CreateDC fails are skipped. The first device that yields a DC decides
+// the outcome: every later step either fails the call or returns success.
+// Returns true only when the context is current and the selected CL device
+// can be associated with it.
+bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) {
+  // Every descriptor field that is not assigned below keeps the zero from
+  // this initialiser (shifts, accumulation bits, aux buffers, masks, ...).
+  PIXELFORMATDESCRIPTOR pfd = {0};
+  pfd.nSize = sizeof(PIXELFORMATDESCRIPTOR);
+  pfd.nVersion = 1;
+  pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
+  pfd.iPixelType = PFD_TYPE_RGBA;
+  pfd.iLayerType = PFD_MAIN_PLANE;
+  // 24-bit colour with 8 bits per channel, plus 8 bits of alpha.
+  pfd.cColorBits = 24;
+  pfd.cRedBits = 8;
+  pfd.cGreenBits = 8;
+  pfd.cBlueBits = 8;
+  pfd.cAlphaBits = 8;
+  pfd.cDepthBits = 24;
+  pfd.cStencilBits = 8;
+
+  DISPLAY_DEVICE dispDevice;
+  dispDevice.cb = sizeof(DISPLAY_DEVICE);
+
+  DWORD deviceNum = 0;
+  while (EnumDisplayDevices(NULL, deviceNum, &dispDevice, 0)) {
+    // The index is only needed for the enumeration call, so advance it now.
+    ++deviceNum;
+
+    if (dispDevice.StateFlags & DISPLAY_DEVICE_MIRRORING_DRIVER) {
+      continue;
+    }
+
+    hGL->hdc = CreateDC(NULL, dispDevice.DeviceName, NULL, NULL);
+    if (!hGL->hdc) {
+      continue;
+    }
+
+    const int pfmt = ChoosePixelFormat(hGL->hdc, &pfd);
+    if (pfmt == 0) {
+      printf("Failed choosing the requested PixelFormat.\n");
+      return false;
+    }
+
+    if (SetPixelFormat(hGL->hdc, pfmt, &pfd) == FALSE) {
+      printf("Failed to set the requested PixelFormat.\n");
+      return false;
+    }
+
+    hGL->hglrc = wglCreateContext(hGL->hdc);
+    if (hGL->hglrc == NULL) {
+      printf("wglCreateContext() failed\n");
+      return false;
+    }
+
+    if (wglMakeCurrent(hGL->hdc, hGL->hglrc) == FALSE) {
+      printf("wglMakeCurrent() failed\n");
+      return false;
+    }
+
+    if (checkAssociationDeviceWithGLContext(hGL)) {
+      return true;
+    }
+
+    // The CL device cannot share with this GL context: undo and give up.
+    deleteGLContext(hGL);
+    return false;
+  }
+
+  // Enumeration ended without a usable display device.
+  return false;
+}
+
+// Reports whether the currently selected CL device (devices_[_deviceId]) is
+// among the devices that clGetGLContextInfoKHR lists for the GL context in
+// hGL.
+//
+// Returns false when either query fails, when no device is listed, when the
+// temporary list cannot be allocated, or when the device is not in the list.
+// error_ holds the status of the last clGetGLContextInfoKHR call made.
+bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) {
+  size_t devicesSize = 0;
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->hglrc,
+                                        CL_WGL_HDC_KHR,
+                                        (cl_context_properties)hGL->hdc,
+                                        0};
+
+  // First call: ask only for the size of the device list.
+  error_ = _wrapper->clGetGLContextInfoKHR(
+      properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &devicesSize);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    return false;
+  }
+
+  const size_t numDevices = devicesSize / sizeof(cl_device_id);
+  if (numDevices == 0) {
+    // Nothing can be associated with this GL context; skip the zero-sized
+    // allocation and the second query.
+    return false;
+  }
+
+  cl_device_id* interopDevices = (cl_device_id*)malloc(devicesSize);
+  if (interopDevices == NULL) {
+    printf("Failed to allocate the interop device list\n");
+    return false;
+  }
+
+  // Second call: fetch the list itself.
+  error_ =
+      _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR,
+                                      devicesSize, interopDevices, NULL);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    free(interopDevices);
+    return false;
+  }
+
+  // Check that current device can be associated with OpenGL context
+  bool ret = false;
+  for (size_t i = 0; i < numDevices; i++) {
+    if (interopDevices[i] == devices_[_deviceId]) {
+      ret = true;
+      break;
+    }
+  }
+
+  free(interopDevices);
+  return ret;
+}
+
+// Replaces the current CL context and the selected device's command queue
+// with ones created against the GL context in hGL (WGL sharing properties),
+// then initialises GLEW.
+//
+// The old queue and context are released first and their members cleared.
+// NOTE(review): CHECK_RESULT presumably returns on failure (compare
+// CHECK_RESULT_NO_RETURN); if so, a failed creation previously left the
+// already-released queue handle in cmdQueues_ for close() to release again.
+void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) {
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->hglrc,
+                                        CL_WGL_HDC_KHR,
+                                        (cl_context_properties)hGL->hdc,
+                                        0};
+
+  // Release current command queue
+  if (cmdQueues_[_deviceId]) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+    cmdQueues_[_deviceId] = 0;
+  }
+
+  // Release current context
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
+    context_ = 0;
+  }
+
+  // Create new CL context from GL context
+  context_ =
+      clCreateContext(properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_);
+
+  // Create command queue for new context
+  cmdQueues_[_deviceId] =
+      _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+               error_);
+
+  // Report the GLEW error text, as the GLX implementation does.
+  GLenum glErr = glewInit();
+  CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed: %s",
+               glewGetErrorString(glErr));
+}
+
+// Makes the rendering context in hGL current on its device context; a NULL
+// handle releases the current context instead. The wglMakeCurrent result is
+// not checked.
+void OCLGLCommon::makeCurrent(OCLGLHandle hGL) {
+  if (hGL != NULL) {
+    wglMakeCurrent(hGL->hdc, hGL->hglrc);
+    return;
+  }
+
+  wglMakeCurrent(NULL, NULL);
+}
+
+// Fills the caller's 7-entry array with a zero-terminated CL context property
+// list: platform, GL rendering context and WGL device context taken from hGL.
+// A NULL output array is silently ignored.
+void OCLGLCommon::getCLContextPropertiesFromGLContext(
+    const OCLGLHandle hGL, cl_context_properties properties[7]) {
+  if (properties == NULL) {
+    return;
+  }
+
+  const cl_context_properties values[7] = {
+      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_,
+      CL_GL_CONTEXT_KHR,   (cl_context_properties)hGL->hglrc,
+      CL_WGL_HDC_KHR,      (cl_context_properties)hGL->hdc,
+      0};
+  for (int slot = 0; slot < 7; ++slot) {
+    properties[slot] = values[slot];
+  }
+}
@@ -0,0 +1,288 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestImp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstdio>
+#include <cstring>
+
+/////////////////////////////////////////////////////////////////////////////
+
+// File-local helpers; both are defined further down in this file.
+// crcinit() computes one _crctab entry from a byte value already shifted into
+// the top 8 bits.
+static unsigned int crcinit(unsigned int crc);
+// initializeSeed() supplies the starting value for _seed; one definition
+// exists per OS macro (ATI_OS_WIN / ATI_OS_LINUX).
+static int initializeSeed(void);
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Storage for the static lock members of OCLTestImp.
+// NOTE(review): presumably these serialise device opening and program
+// compilation respectively; the code visible here never takes them, so
+// confirm against the callers.
+OCLutil::Lock OCLTestImp::openDeviceLock;
+OCLutil::Lock OCLTestImp::compileLock;
+
+// Fills _crctab via crcinit(), seeds the random generator from
+// initializeSeed(), selects GPU devices by default and leaves every OpenCL
+// handle null and the error state cleared.
+OCLTestImp::OCLTestImp()
+    : _wrapper(0),
+      _seed(initializeSeed()),
+      error_(0),
+      type_(CL_DEVICE_TYPE_GPU),
+      deviceCount_(0),
+      devices_(0),
+      platform_(0),
+      context_(0),
+      program_(0),
+      kernel_(0) {
+  // One table entry per byte value, with the byte placed in the top 8 bits.
+  for (unsigned int byte = 0; byte < 256; ++byte) {
+    _crctab[byte] = crcinit(byte << 24);
+  }
+
+  _perfInfo = 0;
+  _iterationCnt = 0;
+  _errorFlag = false;
+  _errorMsg = "";
+}
+
+// Nothing to release here; close() is what frees the OpenCL objects.
+OCLTestImp::~OCLTestImp() {
+}
+// Switches the device type used by open() from the GPU default to CPU.
+void OCLTestImp::useCPU() {
+  type_ = CL_DEVICE_TYPE_CPU;
+}
+// Convenience overload: clears the per-run OpenCL state and opens the test on
+// the platform reported by getPlatformIndex().
+void OCLTestImp::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId) {
+  deviceCount_ = 0;
+  devices_ = 0;
+  kernel_ = 0;
+  program_ = 0;
+  context_ = 0;
+
+  const unsigned int platformIndex = getPlatformIndex();
+  open(test, units, conversion, deviceId, platformIndex);
+}
+// Opens the test on the platform at platformIndex.
+//
+// Steps: reset the per-run state, pick the platform, enumerate its devices of
+// type_, create one context spanning all of them, and create one
+// profiling-enabled command queue per device (appended to cmdQueues_).
+// deviceId is only recorded in _deviceId here. test, units and conversion are
+// not used by this implementation.
+//
+// Failures are reported through CHECK_RESULT with error_ holding the last
+// OpenCL status. A platformIndex outside the enumerated range is reported as
+// "AMD Platform not found" instead of reading past the platform array.
+void OCLTestImp::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId, unsigned int platformIndex) {
+  BaseTestImp::open();
+  devices_ = 0;
+  deviceCount_ = 0;
+  context_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  _deviceId = deviceId;
+  _platformIndex = platformIndex;
+
+  cl_uint numPlatforms = 0;
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
+  CHECK_RESULT((numPlatforms == 0), "No platform found");
+
+  cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+  error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+
+  // Select the platform and free the temporary list before any check below
+  // can leave the function, so the list cannot leak. The index is validated
+  // here; an out-of-range value leaves platform at 0.
+  cl_platform_id platform = 0;
+  if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+    platform = platforms[_platformIndex];
+  }
+  delete[] platforms;
+
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  CHECK_RESULT((platform == 0), "AMD Platform not found");
+
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  // devices_ is owned by this object and freed in close().
+  devices_ = new cl_device_id[deviceCount_];
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)platform, 0};
+  context_ = _wrapper->clCreateContext(props, deviceCount_, devices_, NULL, 0,
+                                       &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext failed");
+
+  cl_command_queue cmdQueue;
+  for (unsigned int i = 0; i < deviceCount_; ++i) {
+#ifndef CL_VERSION_2_0
+    cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[i], CL_QUEUE_PROFILING_ENABLE, &error_);
+#else
+    cl_queue_properties prop[] = {CL_QUEUE_PROPERTIES,
+                                  CL_QUEUE_PROFILING_ENABLE, 0};
+    cmdQueue = _wrapper->clCreateCommandQueueWithProperties(
+        context_, devices_[i], prop, &error_);
+#endif
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues_.push_back(cmdQueue);
+  }
+  platform_ = platform;
+}
+
+// Releases everything open() and the test created: memory objects, kernel,
+// program, command queues, context and the device array, then returns the
+// result of BaseTestImp::close().
+//
+// Release failures are reported without stopping the teardown. Each handle
+// is cleared once released so that calling close() again does not release
+// the same object (or delete[] the device array) a second time.
+unsigned int OCLTestImp::close() {
+  for (unsigned int i = 0; i < buffers().size(); ++i) {
+    error_ = _wrapper->clReleaseMemObject(buffers()[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseMemObject() failed");
+  }
+  buffers_.clear();
+
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed");
+    kernel_ = 0;
+  }
+
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseProgram() failed");
+    program_ = 0;
+  }
+
+  for (unsigned int i = 0; i < cmdQueues_.size(); ++i) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+  cmdQueues_.clear();
+
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
+    context_ = 0;
+  }
+
+  // delete[] accepts a null pointer, so no guard is needed.
+  delete[] devices_;
+  devices_ = 0;
+
+  return BaseTestImp::close();
+}
+
+// Returns n pseudo-random bits (1 <= n <= 32) and advances _seed.
+//
+// For n < 32 one linear-congruential step is taken and the top n of the low
+// 31 bits are returned. For n == 32 two 16-bit draws are combined, the first
+// draw forming the upper half. Any other n trips the assert and yields 0.
+//
+// The arithmetic is done in unsigned int so the multiply and the 16-bit
+// shift wrap instead of overflowing a signed int, and the two 16-bit draws
+// are sequenced explicitly so the result does not depend on the compiler's
+// operand evaluation order.
+int OCLTestImp::genBitRand(int n) {
+  if (n <= 0 || n > 32) {
+    assert(0);
+    return 0;
+  }
+
+  if (n == 32) {
+    const unsigned int high = (unsigned int)genBitRand(16);
+    const unsigned int low = (unsigned int)genBitRand(16);
+    return (int)((high << 16) | low);
+  }
+
+  const unsigned int next = (unsigned int)_seed * 1103515245u + 12345u;
+  _seed = next;
+  /*
+   * return the most-significant n bits; they are the random ones (see
+   * Knuth, Vol 2)
+   */
+  return (int)((next & 0x7fffffffu) >> (31 - n));
+}
+
+// Returns a pseudo-random integer between a and b inclusive (in either
+// order), drawn with genBitRand() and rejection of out-of-range values.
+// When a == b that value is returned without consuming random bits.
+int OCLTestImp::genIntRand(int a, int b) {
+  const int lo = (a > b) ? b : a;
+  const int hi = (a > b) ? a : b;
+
+  int span = hi - lo;
+  if (span == 0) {
+    return a;
+  }
+
+  int direction = 1;
+  if (span < 0) {
+    direction = -1;
+    span = -span;
+  }
+  span &= 0x7fffffff;
+
+  // Number of bits needed to represent span.
+  int width = 0;
+  for (int rest = span; rest > 0; rest >>= 1) {
+    ++width;
+  }
+
+  // Draw width-bit values until one falls inside [0, span].
+  int pick;
+  do {
+    pick = genBitRand(width);
+  } while (pick > span);
+
+  return lo + pick * direction;
+}
+
+// Stores the OpenCL wrapper through which this test issues its CL calls.
+void OCLTestImp::setOCLWrapper(OCLWrapper* wrapper) {
+  _wrapper = wrapper;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef ATI_OS_WIN
+
+#include <windows.h>
+
+// Windows: derive the seed from the performance counter, truncated to int.
+static int initializeSeed(void) {
+  LARGE_INTEGER ticks;
+  QueryPerformanceCounter(&ticks);
+  return (int)ticks.QuadPart;
+}
+
+#endif  // ATI_OS_WIN
+
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef ATI_OS_LINUX
+
+#include <sys/time.h>
+
+// Linux: derive the seed from the microsecond field of the current time.
+static int initializeSeed(void) {
+  struct timeval now;
+  gettimeofday(&now, 0);
+  const int microseconds = (int)now.tv_usec;
+  return microseconds;
+}
+
+#endif  // ATI_OS_LINUX
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Same CRC32 as used by ogtst
+//
+// Polynomial XORed in whenever a set bit is shifted out of the top.
+static const unsigned int CRCMASK = 0x04c11db7;
+
+// Runs eight shift-and-conditionally-XOR steps on crc and returns the result.
+// The constructor calls this with each byte value placed in the top 8 bits
+// to build its lookup table.
+static unsigned int crcinit(unsigned int crc) {
+  unsigned int value = crc;
+
+  for (int step = 8; step > 0; --step) {
+    const bool topBitSet = (value & 0x80000000) != 0;
+    value <<= 1;
+    if (topBitSet) {
+      value ^= CRCMASK;
+    }
+  }
+  return value;
+}
@@ -0,0 +1,70 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+#include <stdlib.h>
+
+#include "OCLTest.h"
+
+//
+//  OCLTestList_TestCount - number of tests this module exposes
+//  (the value of TestListCount)
+//
+unsigned int OCL_CALLCONV OCLTestList_TestCount(void) {
+  return TestListCount;
+}
+
+//
+//  OCLTestList_TestLibVersion - version of the test library this module was
+//  built with (the value of TestLibVersion)
+//
+unsigned int OCL_CALLCONV OCLTestList_TestLibVersion(void) {
+  const unsigned int version = TestLibVersion;
+  return version;
+}
+
+//
+//  OCLTestList_TestLibName - name of this test library
+//  (the value of TestLibName)
+//
+const char* OCL_CALLCONV OCLTestList_TestLibName(void) {
+  return TestLibName;
+}
+
+//
+//  OCLTestList_TestName - name of the test at index testNum, or NULL when
+//  the index is not below OCLTestList_TestCount()
+//
+const char* OCL_CALLCONV OCLTestList_TestName(unsigned int testNum) {
+  const bool inRange = testNum < OCLTestList_TestCount();
+  return inRange ? TestList[testNum].name : NULL;
+}
+
+//
+//  OCLTestList_CreateTest - invoke the create function registered for the
+//  test at index testNum and return the new object; NULL when the index is
+//  not below OCLTestList_TestCount()
+//
+OCLTest* OCL_CALLCONV OCLTestList_CreateTest(unsigned int testNum) {
+  if (testNum < OCLTestList_TestCount()) {
+    return reinterpret_cast<OCLTest*>((*TestList[testNum].create)());
+  }
+
+  return NULL;
+}
+
+//
+//  OCLTestList_DestroyTest - delete a test object; it is deleted inside this
+//  module rather than by the caller
+//
+void OCL_CALLCONV OCLTestList_DestroyTest(OCLTest* test) {
+  delete test;
+}
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestUtils.h"
+
+#include <fstream>
+#include <iostream>
+
+// Reads the whole file `filename` (opened in binary mode) into `s`.
+//
+// Returns true on success. On any failure (the file cannot be opened, its
+// size cannot be determined, or fewer bytes than expected are read) an error
+// is written to std::cerr, false is returned and `s` is left untouched.
+//
+// The content is stored with its exact length, so embedded NUL bytes are
+// kept instead of cutting the string short at the first one.
+bool loadFile(const char* filename, std::string& s) {
+  std::ifstream f(filename, std::ios::in | std::ios::binary);
+  if (!f.is_open()) {
+    std::cerr << "Error: failed to open file: " << filename << '\n';
+    return false;
+  }
+
+  // Determine the size by seeking to the end; tellg() reports -1 on failure.
+  f.seekg(0, std::ios::end);
+  const std::streamoff length = f.tellg();
+  f.seekg(0, std::ios::beg);
+  if (length < 0 || !f) {
+    std::cerr << "Error: failed to read file: " << filename << '\n';
+    return false;
+  }
+
+  std::string contents((size_t)length, '\0');
+  if (length > 0) {
+    f.read(&contents[0], (std::streamsize)length);
+    if (f.gcount() != (std::streamsize)length) {
+      std::cerr << "Error: failed to read file: " << filename << '\n';
+      return false;
+    }
+  }
+
+  s.swap(contents);
+  return true;
+}
@@ -0,0 +1,209 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+//!
+//! \file OCLThread.cpp
+//!
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "OCL/Thread.h"
+#ifdef ATI_OS_WIN
+#include <process.h>
+#endif
+
+//! Packs the thread routine and its data pointer for the Windows thread
+//! entry point. The former struct tag began with a double underscore, which
+//! is a reserved identifier; only the name argsToThreadFunc is used here.
+struct argsToThreadFunc {
+  oclThreadFunc func;  //!< routine to run on the new thread
+  void *data;          //!< create() stores the owning OCLutil::Thread here
+};
+
+#ifdef ATI_OS_WIN
+//! Windows thread entry point - invokes the routine the application passed
+//! to OCLutil::Thread::create() with that thread object's data, then frees
+//! the argument block allocated by create().
+//! The block is deleted through its real type; deleting it through the
+//! void* parameter is undefined behaviour.
+unsigned _stdcall win32ThreadFunc(void *args) {
+  argsToThreadFunc *ptr = (argsToThreadFunc *)args;
+  OCLutil::Thread *obj = (OCLutil::Thread *)ptr->data;
+  ptr->func(obj->getData());
+  delete ptr;
+  return 0;
+}
+#endif
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Constructor for OCLutil::Lock - initialises the underlying
+//! pthread mutex (critical section on Windows)
+//!
+OCLutil::Lock::Lock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_init(&_lock, NULL);
+#else
+  InitializeCriticalSection(&_cs);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Destructor for OCLutil::Lock - destroys the underlying
+//! pthread mutex (critical section on Windows)
+//!
+OCLutil::Lock::~Lock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_destroy(&_lock);
+#else
+  DeleteCriticalSection(&_cs);
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Acquire the lock, waiting if it is currently unavailable;
+//! on return the caller is inside the protected area
+//!
+void OCLutil::Lock::lock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_lock(&_lock);
+#else
+  EnterCriticalSection(&_cs);
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Try to acquire the lock without waiting. Returns true when
+//! the lock was acquired (the caller is then inside the
+//! critical section) and false when it was unavailable.
+//!
+bool OCLutil::Lock::tryLock() {
+#ifdef ATI_OS_WIN
+  const BOOL entered = TryEnterCriticalSection(&_cs);
+  return entered != 0;
+#else
+  // pthread_mutex_trylock reports success as 0.
+  const int status = pthread_mutex_trylock(&_lock);
+  return status == 0;
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Release the lock
+//!
+void OCLutil::Lock::unlock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_unlock(&_lock);
+#else
+  LeaveCriticalSection(&_cs);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Constructor for OCLutil::Thread - no thread exists yet, so the
+//! thread handle and the data pointer start out cleared
+//!
+OCLutil::Thread::Thread() : _tid(0), _data(0) {
+#ifdef ATI_OS_WIN
+  _ID = 0;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Destructor for OCLutil::Thread - on Windows closes the thread
+//! handle, if create() ever produced one; nothing to do elsewhere
+//!
+OCLutil::Thread::~Thread() {
+#ifdef ATI_OS_WIN
+  // _tid is still 0 when no thread was created; do not pass that to
+  // CloseHandle.
+  if (_tid != 0) {
+    CloseHandle(_tid);
+  }
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Create a new thread running func and return whether the
+//! thread was started.
+//!
+//! arg is saved in this object. On Windows the thread starts in
+//! win32ThreadFunc, which calls func with getData(); elsewhere
+//! func is handed to pthread_create together with arg directly.
+//! Setting the VERBOSE environment variable prints diagnostics.
+//!
+bool OCLutil::Thread::create(oclThreadFunc func, void *arg) {
+  // Save the data internally
+  _data = arg;
+
+  const bool verbose = getenv("VERBOSE") != NULL;
+
+#ifdef ATI_OS_WIN
+  // Setup the callback struct for thread function and pass to the
+  // begin thread routine. win32ThreadFunc frees it once func returns.
+  argsToThreadFunc *args = new argsToThreadFunc;
+  args->func = func;
+  args->data = this;
+
+  // _beginthreadex reports failure through a zero handle. The last argument
+  // only receives the new thread's id, which is non-zero on success and
+  // therefore must not be used as the status.
+  unsigned int threadId = 0;
+  _tid = (HANDLE)_beginthreadex(NULL, 0, win32ThreadFunc, args, 0, &threadId);
+
+  if (verbose) {
+    printf("Thread handle value = %p\n", _tid);
+
+    printf("Done creating thread. Thread id value = %u\n", threadId);
+  }
+
+  if (_tid == 0) {
+    // No thread will ever run win32ThreadFunc, so free the block here.
+    delete args;
+    return false;
+  }
+
+  return true;
+#else
+  //! Now create the thread; func receives arg unchanged
+  const int rc = pthread_create(&_tid, NULL, func, arg);
+
+  if (verbose)
+    printf("Done creating thread. Ret value %d, Self = %u\n", rc,
+           (unsigned int)pthread_self());
+
+  return rc == 0;
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Return the ID of the calling thread, converted to unsigned
+//! int (the stored thread handle is not consulted)
+//!
+unsigned int OCLutil::Thread::getID() {
+#ifndef ATI_OS_WIN
+  return (unsigned int)pthread_self();
+#else
+  return GetCurrentThreadId();
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Wait for this thread to finish. Returns true when the wait
+//! (WaitForSingleObject / pthread_join) reports 0.
+//!
+bool OCLutil::Thread::join() {
+#ifdef ATI_OS_WIN
+  const DWORD waitResult = WaitForSingleObject(_tid, INFINITE);
+
+  if (waitResult == WAIT_FAILED) {
+    printf("Bad call to function(invalid handle?)\n");
+  }
+
+  return waitResult == 0;
+#else
+  return pthread_join(_tid, NULL) == 0;
+#endif
+}
@@ -0,0 +1,944 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLWrapper.h"
+
+OCLWrapper::OCLWrapper() {
+  clEnqueueWaitSignalAMD_ptr =
+      (clEnqueueWaitSignalAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueWaitSignalAMD");
+  clEnqueueWriteSignalAMD_ptr =
+      (clEnqueueWriteSignalAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueWriteSignalAMD");
+  clEnqueueMakeBuffersResidentAMD_ptr =
+      (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueMakeBuffersResidentAMD");
+
+  clUnloadPlatformAMD_ptr =
+      (clUnloadPlatformAMD_fn)clGetExtensionFunctionAddress(
+          "clUnloadPlatformAMD");
+
+  // CL-GL function pointers
+  clGetGLContextInfoKHR_ptr =
+      (clGetGLContextInfoKHR_fn)clGetExtensionFunctionAddress(
+          "clGetGLContextInfoKHR");
+  clCreateFromGLBuffer_ptr =
+      (clCreateFromGLBuffer_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLBuffer");
+  clCreateFromGLTexture_ptr =
+      (clCreateFromGLTexture_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLTexture");
+  clCreateFromGLTexture2D_ptr =
+      (clCreateFromGLTexture2D_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLTexture2D");
+  clCreateFromGLRenderbuffer_ptr =
+      (clCreateFromGLRenderbuffer_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLRenderbuffer");
+  clGetGLObjectInfo_ptr =
+      (clGetGLObjectInfo_fn)clGetExtensionFunctionAddress("clGetGLObjectInfo");
+  clGetGLTextureInfo_ptr = (clGetGLTextureInfo_fn)clGetExtensionFunctionAddress(
+      "clGetGLTextureInfo");
+  clEnqueueAcquireGLObjects_ptr =
+      (clEnqueueAcquireGLObjects_fn)clGetExtensionFunctionAddress(
+          "clEnqueueAcquireGLObjects");
+  clEnqueueReleaseGLObjects_ptr =
+      (clEnqueueReleaseGLObjects_fn)clGetExtensionFunctionAddress(
+          "clEnqueueReleaseGLObjects");
+
+  // Performance counter function pointers
+  clCreatePerfCounterAMD_ptr =
+      (clCreatePerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clCreatePerfCounterAMD");
+  clEnqueueBeginPerfCounterAMD_ptr =
+      (clEnqueueBeginPerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueBeginPerfCounterAMD");
+  clEnqueueEndPerfCounterAMD_ptr =
+      (clEnqueueEndPerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueEndPerfCounterAMD");
+  clGetPerfCounterInfoAMD_ptr =
+      (clGetPerfCounterInfoAMD_fn)clGetExtensionFunctionAddress(
+          "clGetPerfCounterInfoAMD");
+  clReleasePerfCounterAMD_ptr =
+      (clReleasePerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clReleasePerfCounterAMD");
+  clRetainPerfCounterAMD_ptr =
+      (clRetainPerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clRetainPerfCounterAMD");
+  clSetDeviceClockModeAMD_ptr =
+      (clSetDeviceClockModeAMD_fn)clGetExtensionFunctionAddress(
+          "clSetDeviceClockModeAMD");
+}
+
+// Pass-through to the global ::clGetPlatformIDs; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetPlatformIDs(cl_uint num_entries,
+                                    cl_platform_id *platforms,
+                                    cl_uint *num_platforms) {
+  const cl_int status =
+      ::clGetPlatformIDs(num_entries, platforms, num_platforms);
+  return status;
+}
+
+// Pass-through to the global ::clGetPlatformInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetPlatformInfo(cl_platform_id platform,
+                                     cl_platform_info param_name,
+                                     size_t param_value_size, void *param_value,
+                                     size_t *param_value_size_ret) {
+  const cl_int status = ::clGetPlatformInfo(
+      platform, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetDeviceIDs; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetDeviceIDs(cl_platform_id platform,
+                                  cl_device_type device_type,
+                                  cl_uint num_entries, cl_device_id *devices,
+                                  cl_uint *num_devices) {
+  const cl_int status = ::clGetDeviceIDs(platform, device_type, num_entries,
+                                         devices, num_devices);
+  return status;
+}
+
+// Pass-through to the global ::clGetDeviceInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetDeviceInfo(cl_device_id device,
+                                   cl_device_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret) {
+  const cl_int status = ::clGetDeviceInfo(
+      device, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateContext; arguments and the returned
+// context handle are forwarded unchanged.
+cl_context OCLWrapper::clCreateContext(
+    cl_context_properties *properties, cl_uint num_devices,
+    const cl_device_id *devices,
+    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+    void *user_data, cl_int *errcode_ret) {
+  cl_context result = ::clCreateContext(properties, num_devices, devices,
+                                        pfn_notify, user_data, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clCreateContextFromType; arguments and the
+// returned context handle are forwarded unchanged.
+cl_context OCLWrapper::clCreateContextFromType(
+    cl_context_properties *properties, cl_device_type device_type,
+    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+    void *user_data, cl_int *errcode_ret) {
+  cl_context result = ::clCreateContextFromType(
+      properties, device_type, pfn_notify, user_data, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clRetainContext.
+cl_int OCLWrapper::clRetainContext(cl_context context) {
+  const cl_int status = ::clRetainContext(context);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseContext.
+cl_int OCLWrapper::clReleaseContext(cl_context context) {
+  const cl_int status = ::clReleaseContext(context);
+  return status;
+}
+
+// Pass-through to the global ::clGetContextInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetContextInfo(cl_context context,
+                                    cl_context_info param_name,
+                                    size_t param_value_size, void *param_value,
+                                    size_t *param_value_size_ret) {
+  const cl_int status =
+      ::clGetContextInfo(context, param_name, param_value_size, param_value,
+                         param_value_size_ret);
+  return status;
+}
+
+// Creates a command queue, choosing the creation entry point at run time.
+//
+// When built against OpenCL 2.0+ headers, the platform that owns `device` is
+// asked for CL_PLATFORM_VERSION. If it reports a 1.x version the legacy
+// ::clCreateCommandQueue is used; otherwise (including when the version
+// cannot be queried) ::clCreateCommandQueueWithProperties is used.
+// When built against older headers ::clCreateCommandQueue is always used.
+cl_command_queue OCLWrapper::clCreateCommandQueue(
+    cl_context context, cl_device_id device,
+    cl_command_queue_properties properties, cl_int *errcode_ret) {
+#if defined(CL_VERSION_2_0)
+  // The version string has the form "OpenCL <major>.<minor> <info>", so the
+  // major version digit sits right after the 7-character "OpenCL " prefix.
+  const size_t kMajorDigitIndex = 7;
+
+  bool version20 = true;
+  cl_platform_id pid;
+  cl_int err = ::clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
+                                 sizeof(cl_platform_id), &pid, NULL);
+  if (err == CL_SUCCESS) {
+    size_t size = 0;
+    err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, 0, NULL, &size);
+    // Only inspect the string when it is long enough to contain the digit.
+    if (err == CL_SUCCESS && size > kMajorDigitIndex) {
+      char *ver = new char[size];
+      err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, size, ver, NULL);
+      if (err == CL_SUCCESS && ver[kMajorDigitIndex] == '1') {
+        version20 = false;
+      }
+      delete[] ver;
+    }
+  }
+
+  if (!version20) {
+    return ::clCreateCommandQueue(context, device, properties, errcode_ret);
+  }
+
+  // Zero-terminated property list; NULL is passed instead when `properties`
+  // is zero.
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0};
+  return ::clCreateCommandQueueWithProperties(
+      context, device, properties ? cprops : NULL, errcode_ret);
+#else
+  return ::clCreateCommandQueue(context, device, properties, errcode_ret);
+#endif
+}
+
+// Pass-through to the global ::clRetainCommandQueue.
+cl_int OCLWrapper::clRetainCommandQueue(cl_command_queue command_queue) {
+  const cl_int status = ::clRetainCommandQueue(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseCommandQueue.
+cl_int OCLWrapper::clReleaseCommandQueue(cl_command_queue command_queue) {
+  const cl_int status = ::clReleaseCommandQueue(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clGetCommandQueueInfo; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clGetCommandQueueInfo(cl_command_queue command_queue,
+                                         cl_command_queue_info param_name,
+                                         size_t param_value_size,
+                                         void *param_value,
+                                         size_t *param_value_size_ret) {
+  const cl_int status = ::clGetCommandQueueInfo(
+      command_queue, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateBuffer; arguments and the returned
+// memory object are forwarded unchanged.
+cl_mem OCLWrapper::clCreateBuffer(cl_context context, cl_mem_flags flags,
+                                  size_t size, void *host_ptr,
+                                  cl_int *errcode_ret) {
+  cl_mem result = ::clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clCreateImage2D; arguments and the returned
+// memory object are forwarded unchanged.
+cl_mem OCLWrapper::clCreateImage2D(cl_context context, cl_mem_flags flags,
+                                   const cl_image_format *image_format,
+                                   size_t image_width, size_t image_height,
+                                   size_t image_row_pitch, void *host_ptr,
+                                   cl_int *errcode_ret) {
+  cl_mem result =
+      ::clCreateImage2D(context, flags, image_format, image_width,
+                        image_height, image_row_pitch, host_ptr, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clCreateImage3D; arguments and the returned
+// memory object are forwarded unchanged.
+cl_mem OCLWrapper::clCreateImage3D(cl_context context, cl_mem_flags flags,
+                                   const cl_image_format *image_format,
+                                   size_t image_width, size_t image_height,
+                                   size_t image_depth, size_t image_row_pitch,
+                                   size_t image_slice_pitch, void *host_ptr,
+                                   cl_int *errcode_ret) {
+  cl_mem result = ::clCreateImage3D(
+      context, flags, image_format, image_width, image_height, image_depth,
+      image_row_pitch, image_slice_pitch, host_ptr, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clRetainMemObject.
+cl_int OCLWrapper::clRetainMemObject(cl_mem memobj) {
+  const cl_int status = ::clRetainMemObject(memobj);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseMemObject.
+cl_int OCLWrapper::clReleaseMemObject(cl_mem memobj) {
+  const cl_int status = ::clReleaseMemObject(memobj);
+  return status;
+}
+
+// Pass-through to the global ::clGetSupportedImageFormats; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clGetSupportedImageFormats(cl_context context,
+                                              cl_mem_flags flags,
+                                              cl_mem_object_type image_type,
+                                              cl_uint num_entries,
+                                              cl_image_format *image_formats,
+                                              cl_uint *num_image_formats) {
+  const cl_int status = ::clGetSupportedImageFormats(
+      context, flags, image_type, num_entries, image_formats,
+      num_image_formats);
+  return status;
+}
+
+// Pass-through to the global ::clGetMemObjectInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name,
+                                      size_t param_value_size,
+                                      void *param_value,
+                                      size_t *param_value_size_ret) {
+  const cl_int status = ::clGetMemObjectInfo(
+      memobj, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetImageInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetImageInfo(cl_mem image, cl_image_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret) {
+  const cl_int status = ::clGetImageInfo(
+      image, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Creates a sampler. The entry point is selected at compile time: without
+// CL_VERSION_2_0 the legacy ::clCreateSampler is called, otherwise the three
+// settings are packed into a property list for
+// ::clCreateSamplerWithProperties.
+cl_sampler OCLWrapper::clCreateSampler(cl_context context,
+                                       cl_bool normalized_coords,
+                                       cl_addressing_mode addressing_mode,
+                                       cl_filter_mode filter_mode,
+                                       cl_int *errcode_ret) {
+#ifndef CL_VERSION_2_0
+  return ::clCreateSampler(context, normalized_coords, addressing_mode,
+                           filter_mode, errcode_ret);
+#else
+  // (name, value) pairs followed by a terminating 0.
+  const cl_sampler_properties sampler_props[] = {
+      CL_SAMPLER_NORMALIZED_COORDS,
+      static_cast<cl_sampler_properties>(normalized_coords),
+      CL_SAMPLER_ADDRESSING_MODE,
+      static_cast<cl_sampler_properties>(addressing_mode),
+      CL_SAMPLER_FILTER_MODE,
+      static_cast<cl_sampler_properties>(filter_mode),
+      0};
+  cl_sampler result =
+      ::clCreateSamplerWithProperties(context, sampler_props, errcode_ret);
+  return result;
+#endif
+}
+
+// Pass-through to the global ::clRetainSampler.
+cl_int OCLWrapper::clRetainSampler(cl_sampler sampler) {
+  const cl_int status = ::clRetainSampler(sampler);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseSampler.
+cl_int OCLWrapper::clReleaseSampler(cl_sampler sampler) {
+  const cl_int status = ::clReleaseSampler(sampler);
+  return status;
+}
+
+// Pass-through to the global ::clGetSamplerInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetSamplerInfo(cl_sampler sampler,
+                                    cl_sampler_info param_name,
+                                    size_t param_value_size, void *param_value,
+                                    size_t *param_value_size_ret) {
+  const cl_int status =
+      ::clGetSamplerInfo(sampler, param_name, param_value_size, param_value,
+                         param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateProgramWithSource; arguments and the
+// returned program handle are forwarded unchanged.
+cl_program OCLWrapper::clCreateProgramWithSource(cl_context context,
+                                                 cl_uint count,
+                                                 const char **strings,
+                                                 const size_t *lengths,
+                                                 cl_int *errcode_ret) {
+  cl_program result = ::clCreateProgramWithSource(context, count, strings,
+                                                  lengths, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clCreateProgramWithBinary; arguments and the
+// returned program handle are forwarded unchanged.
+cl_program OCLWrapper::clCreateProgramWithBinary(
+    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+    const size_t *lengths, const unsigned char **binaries,
+    cl_int *binary_status, cl_int *errcode_ret) {
+  cl_program result = ::clCreateProgramWithBinary(
+      context, num_devices, device_list, lengths, binaries, binary_status,
+      errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clRetainProgram.
+cl_int OCLWrapper::clRetainProgram(cl_program program) {
+  const cl_int status = ::clRetainProgram(program);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseProgram.
+cl_int OCLWrapper::clReleaseProgram(cl_program program) {
+  const cl_int status = ::clReleaseProgram(program);
+  return status;
+}
+
+// Pass-through to the global ::clBuildProgram; arguments (including the
+// notification callback) and result are forwarded unchanged.
+cl_int OCLWrapper::clBuildProgram(
+    cl_program program, cl_uint num_devices, const cl_device_id *device_list,
+    const char *options,
+    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+    void *user_data) {
+  const cl_int status = ::clBuildProgram(program, num_devices, device_list,
+                                         options, pfn_notify, user_data);
+  return status;
+}
+
+// Pass-through to the global ::clCompileProgram; arguments (including the
+// notification callback) and result are forwarded unchanged.
+cl_int OCLWrapper::clCompileProgram(
+    cl_program program, cl_uint num_devices, const cl_device_id *device_list,
+    const char *options, cl_uint num_input_headers,
+    const cl_program *input_headers, const char **header_include_names,
+    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+    void *user_data) {
+  const cl_int status = ::clCompileProgram(
+      program, num_devices, device_list, options, num_input_headers,
+      input_headers, header_include_names, pfn_notify, user_data);
+  return status;
+}
+
+// Pass-through to the global ::clLinkProgram; arguments and the returned
+// program handle are forwarded unchanged.
+cl_program OCLWrapper::clLinkProgram(
+    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+    const char *options, cl_uint num_input_programs,
+    const cl_program *input_programs,
+    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+    void *user_data, cl_int *errcode_ret) {
+  cl_program result = ::clLinkProgram(
+      context, num_devices, device_list, options, num_input_programs,
+      input_programs, pfn_notify, user_data, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clUnloadCompiler.
+cl_int OCLWrapper::clUnloadCompiler(void) {
+  const cl_int status = ::clUnloadCompiler();
+  return status;
+}
+
+// Pass-through to the global ::clGetProgramInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetProgramInfo(cl_program program,
+                                    cl_program_info param_name,
+                                    size_t param_value_size, void *param_value,
+                                    size_t *param_value_size_ret) {
+  const cl_int status =
+      ::clGetProgramInfo(program, param_name, param_value_size, param_value,
+                         param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetProgramBuildInfo; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clGetProgramBuildInfo(
+    cl_program program, cl_device_id device, cl_program_build_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
+  const cl_int status = ::clGetProgramBuildInfo(
+      program, device, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateKernel; arguments and the returned
+// kernel handle are forwarded unchanged.
+cl_kernel OCLWrapper::clCreateKernel(cl_program program,
+                                     const char *kernel_name,
+                                     cl_int *errcode_ret) {
+  cl_kernel result = ::clCreateKernel(program, kernel_name, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clCreateKernelsInProgram; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clCreateKernelsInProgram(cl_program program,
+                                            cl_uint num_kernels,
+                                            cl_kernel *kernels,
+                                            cl_uint *num_kernels_ret) {
+  const cl_int status = ::clCreateKernelsInProgram(program, num_kernels,
+                                                   kernels, num_kernels_ret);
+  return status;
+}
+
+// Pass-through to the global ::clRetainKernel.
+cl_int OCLWrapper::clRetainKernel(cl_kernel kernel) {
+  const cl_int status = ::clRetainKernel(kernel);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseKernel.
+cl_int OCLWrapper::clReleaseKernel(cl_kernel kernel) {
+  const cl_int status = ::clReleaseKernel(kernel);
+  return status;
+}
+
+// Pass-through to the global ::clSetKernelArg; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clSetKernelArg(cl_kernel kernel, cl_uint arg_index,
+                                  size_t arg_size, const void *arg_value) {
+  const cl_int status =
+      ::clSetKernelArg(kernel, arg_index, arg_size, arg_value);
+  return status;
+}
+
+// Pass-through to the global ::clGetKernelInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret) {
+  const cl_int status = ::clGetKernelInfo(
+      kernel, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetKernelWorkGroupInfo; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clGetKernelWorkGroupInfo(
+    cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
+  const cl_int status = ::clGetKernelWorkGroupInfo(
+      kernel, device, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clWaitForEvents.
+cl_int OCLWrapper::clWaitForEvents(cl_uint num_events,
+                                   const cl_event *event_list) {
+  const cl_int status = ::clWaitForEvents(num_events, event_list);
+  return status;
+}
+
+// Pass-through to the global ::clGetEventInfo; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clGetEventInfo(cl_event evnt, cl_event_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret) {
+  const cl_int status = ::clGetEventInfo(
+      evnt, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clRetainEvent.
+cl_int OCLWrapper::clRetainEvent(cl_event evnt) {
+  const cl_int status = ::clRetainEvent(evnt);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseEvent.
+cl_int OCLWrapper::clReleaseEvent(cl_event evnt) {
+  const cl_int status = ::clReleaseEvent(evnt);
+  return status;
+}
+
+// Pass-through to the global ::clGetEventProfilingInfo; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clGetEventProfilingInfo(cl_event evnt,
+                                           cl_profiling_info param_name,
+                                           size_t param_value_size,
+                                           void *param_value,
+                                           size_t *param_value_size_ret) {
+  const cl_int status = ::clGetEventProfilingInfo(
+      evnt, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clFlush.
+cl_int OCLWrapper::clFlush(cl_command_queue command_queue) {
+  const cl_int status = ::clFlush(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clFinish.
+cl_int OCLWrapper::clFinish(cl_command_queue command_queue) {
+  const cl_int status = ::clFinish(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueReadBuffer; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clEnqueueReadBuffer(cl_command_queue command_queue,
+                                       cl_mem buffer, cl_bool blocking_read,
+                                       size_t offset, size_t cb, void *ptr,
+                                       cl_uint num_events_in_wait_list,
+                                       const cl_event *event_wait_list,
+                                       cl_event *evnt) {
+  const cl_int status = ::clEnqueueReadBuffer(
+      command_queue, buffer, blocking_read, offset, cb, ptr,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWriteBuffer; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueWriteBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+    size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueWriteBuffer(
+      command_queue, buffer, blocking_write, offset, cb, ptr,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyBuffer; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clEnqueueCopyBuffer(cl_command_queue command_queue,
+                                       cl_mem src_buffer, cl_mem dst_buffer,
+                                       size_t src_offset, size_t dst_offset,
+                                       size_t cb,
+                                       cl_uint num_events_in_wait_list,
+                                       const cl_event *event_wait_list,
+                                       cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyBuffer(
+      command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueReadBufferRect; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueReadBufferRect(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+    const size_t *buffer_origin, const size_t *host_origin,
+    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+    size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueReadBufferRect(
+      command_queue, buffer, blocking_read, buffer_origin, host_origin, region,
+      buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+      ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWriteBufferRect; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueWriteBufferRect(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+    const size_t *buffer_origin, const size_t *host_origin,
+    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+    size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueWriteBufferRect(
+      command_queue, buffer, blocking_write, buffer_origin, host_origin, region,
+      buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+      ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyBufferRect; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueCopyBufferRect(
+    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
+    const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+    size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
+    size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyBufferRect(
+      command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+      src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueReadImage; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clEnqueueReadImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
+    const size_t *origin, const size_t *region, size_t row_pitch,
+    size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueReadImage(
+      command_queue, image, blocking_read, origin, region, row_pitch,
+      slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWriteImage; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clEnqueueWriteImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
+    const size_t *origin, const size_t *region, size_t input_row_pitch,
+    size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueWriteImage(
+      command_queue, image, blocking_write, origin, region, input_row_pitch,
+      input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyImage; arguments and result are
+// forwarded unchanged.
+cl_int OCLWrapper::clEnqueueCopyImage(
+    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
+    const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyImage(
+      command_queue, src_image, dst_image, src_origin, dst_origin, region,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyImageToBuffer; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueCopyImageToBuffer(
+    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer,
+    const size_t *src_origin, const size_t *region, size_t dst_offset,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyImageToBuffer(
+      command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyBufferToImage; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueCopyBufferToImage(
+    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,
+    size_t src_offset, const size_t *dst_origin, const size_t *region,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyBufferToImage(
+      command_queue, src_buffer, dst_image, src_offset, dst_origin, region,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMapBuffer; arguments and the
+// returned pointer are forwarded unchanged.
+void *OCLWrapper::clEnqueueMapBuffer(cl_command_queue command_queue,
+                                     cl_mem buffer, cl_bool blocking_map,
+                                     cl_map_flags map_flags, size_t offset,
+                                     size_t cb, cl_uint num_events_in_wait_list,
+                                     const cl_event *event_wait_list,
+                                     cl_event *evnt, cl_int *errcode_ret) {
+  void *mapped = ::clEnqueueMapBuffer(
+      command_queue, buffer, blocking_map, map_flags, offset, cb,
+      num_events_in_wait_list, event_wait_list, evnt, errcode_ret);
+  return mapped;
+}
+
+// Pass-through to the global ::clEnqueueMapImage; arguments and the returned
+// pointer are forwarded unchanged.
+void *OCLWrapper::clEnqueueMapImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
+    cl_map_flags map_flags, const size_t *origin, const size_t *region,
+    size_t *image_row_pitch, size_t *image_slice_pitch,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt, cl_int *errcode_ret) {
+  void *mapped = ::clEnqueueMapImage(
+      command_queue, image, blocking_map, map_flags, origin, region,
+      image_row_pitch, image_slice_pitch, num_events_in_wait_list,
+      event_wait_list, evnt, errcode_ret);
+  return mapped;
+}
+
+// Pass-through to the global ::clEnqueueUnmapMemObject; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueUnmapMemObject(cl_command_queue command_queue,
+                                           cl_mem memobj, void *mapped_ptr,
+                                           cl_uint num_events_in_wait_list,
+                                           const cl_event *event_wait_list,
+                                           cl_event *evnt) {
+  const cl_int status = ::clEnqueueUnmapMemObject(
+      command_queue, memobj, mapped_ptr, num_events_in_wait_list,
+      event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueNDRangeKernel; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueNDRangeKernel(
+    cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
+    const size_t *global_work_offset, const size_t *global_work_size,
+    const size_t *local_work_size, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueNDRangeKernel(
+      command_queue, kernel, work_dim, global_work_offset, global_work_size,
+      local_work_size, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Enqueues a single-work-item kernel launch. The entry point is selected at
+// compile time: without CL_VERSION_2_0 the call goes to ::clEnqueueTask,
+// otherwise it is issued as a 1-dimensional ::clEnqueueNDRangeKernel with a
+// global and local size of 1.
+cl_int OCLWrapper::clEnqueueTask(cl_command_queue command_queue,
+                                 cl_kernel kernel,
+                                 cl_uint num_events_in_wait_list,
+                                 const cl_event *event_wait_list,
+                                 cl_event *evnt) {
+#if !defined(CL_VERSION_2_0)
+  return ::clEnqueueTask(command_queue, kernel, num_events_in_wait_list,
+                         event_wait_list, evnt);
+#else
+  static size_t const kGlobalSize[3] = {1, 0, 0};
+  static size_t const kLocalSize[3] = {1, 0, 0};
+
+  const cl_int status = ::clEnqueueNDRangeKernel(
+      command_queue, kernel, 1, NULL, kGlobalSize, kLocalSize,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+#endif
+}
+
+// Pass-through to the global ::clEnqueueNativeKernel; arguments and result
+// are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueNativeKernel(
+    cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),
+    void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,
+    const void **args_mem_loc, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueNativeKernel(
+      command_queue, user_func, args, cb_args, num_mem_objects, mem_list,
+      args_mem_loc, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMarker.
+cl_int OCLWrapper::clEnqueueMarker(cl_command_queue command_queue,
+                                   cl_event *evnt) {
+  const cl_int status = ::clEnqueueMarker(command_queue, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMarkerWithWaitList; arguments and
+// result are forwarded unchanged.
+cl_int OCLWrapper::clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+                                               cl_uint num_events_in_wait_list,
+                                               const cl_event *event_wait_list,
+                                               cl_event *evnt) {
+  const cl_int status = ::clEnqueueMarkerWithWaitList(
+      command_queue, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWaitForEvents.
+cl_int OCLWrapper::clEnqueueWaitForEvents(cl_command_queue command_queue,
+                                          cl_uint num_events,
+                                          const cl_event *event_list) {
+  const cl_int status =
+      ::clEnqueueWaitForEvents(command_queue, num_events, event_list);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueBarrier.
+cl_int OCLWrapper::clEnqueueBarrier(cl_command_queue command_queue) {
+  const cl_int status = ::clEnqueueBarrier(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clGetExtensionFunctionAddress; returns
+// whatever address that call reports for `func_name`.
+void *OCLWrapper::clGetExtensionFunctionAddress(const char *func_name) {
+  void *address = ::clGetExtensionFunctionAddress(func_name);
+  return address;
+}
+
+// Pass-through to the global ::clCreateImage; arguments and the returned
+// memory object are forwarded unchanged.
+cl_mem OCLWrapper::clCreateImage(cl_context context, cl_mem_flags flags,
+                                 const cl_image_format *image_format,
+                                 const cl_image_desc *image_desc,
+                                 void *host_ptr, cl_int *errcode_ret) {
+  cl_mem result = ::clCreateImage(context, flags, image_format, image_desc,
+                                  host_ptr, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clCreateSubBuffer; arguments and the returned
+// memory object are forwarded unchanged.
+cl_mem OCLWrapper::clCreateSubBuffer(cl_mem mem, cl_mem_flags flags,
+                                     cl_buffer_create_type buffer_create_type,
+                                     const void *buffer_create_info,
+                                     cl_int *errcode_ret) {
+  cl_mem result = ::clCreateSubBuffer(mem, flags, buffer_create_type,
+                                      buffer_create_info, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clSetEventCallback entry point; the callback
+// and user_data are handed on unchanged.
+cl_int OCLWrapper::clSetEventCallback(
+    cl_event event, cl_int command_exec_callback_type,
+    void(CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                        cl_int event_command_exec_status,
+                                        void *user_data),
+    void *user_data) {
+  const cl_int status = ::clSetEventCallback(
+      event, command_exec_callback_type, pfn_event_notify, user_data);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueFillImage entry point.
+cl_int OCLWrapper::clEnqueueFillImage(
+    cl_command_queue command_queue, cl_mem image, void *ptr,
+    const size_t *origin, const size_t *region, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status =
+      ::clEnqueueFillImage(command_queue, image, ptr, origin, region,
+                           num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Calls the clUnloadPlatformAMD extension through its stored pointer.
+// When the pointer is not set the call is skipped and CL_SUCCESS is returned.
+cl_int OCLWrapper::clUnloadPlatformAMD(cl_platform_id id) {
+  if (!clUnloadPlatformAMD_ptr) {
+    return CL_SUCCESS;
+  }
+  return clUnloadPlatformAMD_ptr(id);
+}
+// Calls the clEnqueueWaitSignalAMD extension through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueWaitSignalAMD(cl_command_queue command_queue,
+                                          cl_mem mem_object, cl_uint value,
+                                          cl_uint num_events,
+                                          const cl_event *event_wait_list,
+                                          cl_event *event) {
+  const cl_int status = clEnqueueWaitSignalAMD_ptr(
+      command_queue, mem_object, value, num_events, event_wait_list, event);
+  return status;
+}
+
+// Calls the clEnqueueWriteSignalAMD extension through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueWriteSignalAMD(cl_command_queue command_queue,
+                                           cl_mem mem_object, cl_uint value,
+                                           cl_ulong offset, cl_uint num_events,
+                                           const cl_event *event_list,
+                                           cl_event *event) {
+  const cl_int status = clEnqueueWriteSignalAMD_ptr(
+      command_queue, mem_object, value, offset, num_events, event_list, event);
+  return status;
+}
+
+// Calls the clEnqueueMakeBuffersResidentAMD extension through its stored
+// pointer. The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueMakeBuffersResidentAMD(
+    cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects,
+    cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses,
+    cl_uint num_events, const cl_event *event_list, cl_event *event) {
+  const cl_int status = clEnqueueMakeBuffersResidentAMD_ptr(
+      command_queue, num_mem_objs, mem_objects, blocking_make_resident,
+      bus_addresses, num_events, event_list, event);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMigrateMemObjects entry point.
+cl_int OCLWrapper::clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                                              cl_uint num_mem_objects,
+                                              const cl_mem *mem_objects,
+                                              cl_mem_migration_flags flags,
+                                              cl_uint num_events_in_wait_list,
+                                              const cl_event *event_wait_list,
+                                              cl_event *event) {
+  const cl_int status = ::clEnqueueMigrateMemObjects(
+      command_queue, num_mem_objects, mem_objects, flags,
+      num_events_in_wait_list, event_wait_list, event);
+  return status;
+}
+
+// Calls clGetGLContextInfoKHR through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clGetGLContextInfoKHR(
+    const cl_context_properties *properties, cl_gl_context_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
+  const cl_int status = clGetGLContextInfoKHR_ptr(
+      properties, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Calls clCreateFromGLBuffer through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_mem OCLWrapper::clCreateFromGLBuffer(cl_context context, cl_mem_flags flags,
+                                        unsigned int bufobj, int *errcode_ret) {
+  cl_mem result = clCreateFromGLBuffer_ptr(context, flags, bufobj, errcode_ret);
+  return result;
+}
+
+// Calls clCreateFromGLTexture through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_mem OCLWrapper::clCreateFromGLTexture(cl_context context, cl_mem_flags flags,
+                                         unsigned int texture_target,
+                                         int miplevel, unsigned int texture,
+                                         cl_int *errcode_ret) {
+  cl_mem result = clCreateFromGLTexture_ptr(context, flags, texture_target,
+                                            miplevel, texture, errcode_ret);
+  return result;
+}
+
+// Calls clCreateFromGLTexture2D through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_mem OCLWrapper::clCreateFromGLTexture2D(cl_context context,
+                                           cl_mem_flags flags,
+                                           unsigned int texture_target,
+                                           int miplevel, unsigned int texture,
+                                           cl_int *errcode_ret) {
+  cl_mem result = clCreateFromGLTexture2D_ptr(context, flags, texture_target,
+                                              miplevel, texture, errcode_ret);
+  return result;
+}
+
+// Calls clCreateFromGLRenderbuffer through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_mem OCLWrapper::clCreateFromGLRenderbuffer(cl_context context,
+                                              cl_mem_flags flags,
+                                              unsigned int renderbuffer,
+                                              cl_int *errcode_ret) {
+  cl_mem result =
+      clCreateFromGLRenderbuffer_ptr(context, flags, renderbuffer, errcode_ret);
+  return result;
+}
+
+// Calls clGetGLObjectInfo through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clGetGLObjectInfo(cl_mem memobj,
+                                     cl_gl_object_type *gl_object_type,
+                                     unsigned int *gl_object_name) {
+  const cl_int status =
+      clGetGLObjectInfo_ptr(memobj, gl_object_type, gl_object_name);
+  return status;
+}
+
+// Calls clGetGLTextureInfo through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clGetGLTextureInfo(cl_mem memobj,
+                                      cl_gl_texture_info param_name,
+                                      size_t param_value_size,
+                                      void *param_value,
+                                      size_t *param_value_size_ret) {
+  const cl_int status = clGetGLTextureInfo_ptr(
+      memobj, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Calls clEnqueueAcquireGLObjects through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueAcquireGLObjects(cl_command_queue command_queue,
+                                             cl_uint num_objects,
+                                             const cl_mem *mem_objects,
+                                             cl_uint num_events_in_wait_list,
+                                             const cl_event *event_wait_list,
+                                             cl_event *event) {
+  const cl_int status = clEnqueueAcquireGLObjects_ptr(
+      command_queue, num_objects, mem_objects, num_events_in_wait_list,
+      event_wait_list, event);
+  return status;
+}
+
+// Calls clEnqueueReleaseGLObjects through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueReleaseGLObjects(cl_command_queue command_queue,
+                                             cl_uint num_objects,
+                                             const cl_mem *mem_objects,
+                                             cl_uint num_events_in_wait_list,
+                                             const cl_event *event_wait_list,
+                                             cl_event *event) {
+  const cl_int status = clEnqueueReleaseGLObjects_ptr(
+      command_queue, num_objects, mem_objects, num_events_in_wait_list,
+      event_wait_list, event);
+  return status;
+}
+
+#if defined(CL_VERSION_2_0)
+// Pass-through to the global ::clCreateCommandQueueWithProperties entry point.
+cl_command_queue OCLWrapper::clCreateCommandQueueWithProperties(
+    cl_context context, cl_device_id device,
+    const cl_queue_properties *properties, cl_int *errcode_ret) {
+  cl_command_queue queue = ::clCreateCommandQueueWithProperties(
+      context, device, properties, errcode_ret);
+  return queue;
+}
+
+// Pass-through to the global ::clSVMAlloc entry point.
+void *OCLWrapper::clSVMAlloc(cl_context context, cl_svm_mem_flags flags,
+                             size_t size, cl_uint alignment) {
+  void *const allocation = ::clSVMAlloc(context, flags, size, alignment);
+  return allocation;
+}
+
+// Pass-through to the global ::clSVMFree entry point; nothing is returned.
+void OCLWrapper::clSVMFree(cl_context context, void *svm_pointer) {
+  ::clSVMFree(context, svm_pointer);
+}
+
+// Pass-through to the global ::clEnqueueSVMMap entry point.
+cl_int OCLWrapper::clEnqueueSVMMap(cl_command_queue command_queue,
+                                   cl_bool blocking_map, cl_map_flags flags,
+                                   void *svm_ptr, size_t size,
+                                   cl_uint num_events_in_wait_list,
+                                   const cl_event *event_wait_list,
+                                   cl_event *event) {
+  const cl_int status =
+      ::clEnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size,
+                        num_events_in_wait_list, event_wait_list, event);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueSVMUnmap entry point.
+cl_int OCLWrapper::clEnqueueSVMUnmap(cl_command_queue command_queue,
+                                     void *svm_ptr,
+                                     cl_uint num_events_in_wait_list,
+                                     const cl_event *event_wait_list,
+                                     cl_event *event) {
+  const cl_int status = ::clEnqueueSVMUnmap(
+      command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
+  return status;
+}
+// Pass-through to the global ::clEnqueueSVMMemFill entry point.
+cl_int OCLWrapper::clEnqueueSVMMemFill(cl_command_queue command_queue,
+                                       void *svm_ptr, const void *pattern,
+                                       size_t pattern_size, size_t size,
+                                       cl_uint num_events_in_wait_list,
+                                       const cl_event *event_wait_list,
+                                       cl_event *event) {
+  const cl_int status = ::clEnqueueSVMMemFill(
+      command_queue, svm_ptr, pattern, pattern_size, size,
+      num_events_in_wait_list, event_wait_list, event);
+  return status;
+}
+
+// Pass-through to the global ::clSetKernelArgSVMPointer entry point.
+cl_int OCLWrapper::clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index,
+                                            const void *arg_value) {
+  const cl_int status = ::clSetKernelArgSVMPointer(kernel, arg_index, arg_value);
+  return status;
+}
+
+// Pass-through to the global ::clCreatePipe entry point.
+cl_mem OCLWrapper::clCreatePipe(cl_context context, cl_mem_flags flags,
+                                cl_uint packet_size, cl_uint pipe_max_packets,
+                                const cl_pipe_properties *properties,
+                                cl_int *errcode_ret) {
+  cl_mem result = ::clCreatePipe(context, flags, packet_size, pipe_max_packets,
+                                 properties, errcode_ret);
+  return result;
+}
+
+// Pass-through to the global ::clGetPipeInfo entry point.
+cl_int OCLWrapper::clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret) {
+  const cl_int status = ::clGetPipeInfo(pipe, param_name, param_value_size,
+                                        param_value, param_value_size_ret);
+  return status;
+}
+
+#endif
+
+// Calls clCreatePerfCounterAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_perfcounter_amd OCLWrapper::clCreatePerfCounterAMD(
+    cl_device_id device, cl_perfcounter_property *properties,
+    cl_int *errcode_ret) {
+  cl_perfcounter_amd counter =
+      clCreatePerfCounterAMD_ptr(device, properties, errcode_ret);
+  return counter;
+}
+
+// Calls clEnqueueBeginPerfCounterAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueBeginPerfCounterAMD(
+    cl_command_queue command_queue, cl_uint num_perf_counters,
+    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event) {
+  const cl_int status = clEnqueueBeginPerfCounterAMD_ptr(
+      command_queue, num_perf_counters, perf_counters, num_events_in_wait_list,
+      event_wait_list, event);
+  return status;
+}
+
+// Calls clEnqueueEndPerfCounterAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clEnqueueEndPerfCounterAMD(cl_command_queue command_queue,
+                                              cl_uint num_perf_counters,
+                                              cl_perfcounter_amd *perf_counters,
+                                              cl_uint num_events_in_wait_list,
+                                              const cl_event *event_wait_list,
+                                              cl_event *event) {
+  const cl_int status = clEnqueueEndPerfCounterAMD_ptr(
+      command_queue, num_perf_counters, perf_counters, num_events_in_wait_list,
+      event_wait_list, event);
+  return status;
+}
+
+// Calls clGetPerfCounterInfoAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter,
+                                           cl_perfcounter_info param_name,
+                                           size_t param_value_size,
+                                           void *param_value,
+                                           size_t *param_value_size_ret) {
+  const cl_int status = clGetPerfCounterInfoAMD_ptr(
+      perf_counter, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Calls clReleasePerfCounterAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter) {
+  const cl_int status = clReleasePerfCounterAMD_ptr(perf_counter);
+  return status;
+}
+
+// Calls clRetainPerfCounterAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter) {
+  const cl_int status = clRetainPerfCounterAMD_ptr(perf_counter);
+  return status;
+}
+
+// Calls clSetDeviceClockModeAMD through its stored pointer.
+// The pointer is not checked here before it is called.
+cl_int OCLWrapper::clSetDeviceClockModeAMD(
+    cl_device_id device,
+    cl_set_device_clock_mode_input_amd set_clock_mode_input,
+    cl_set_device_clock_mode_output_amd *set_clock_mode_output) {
+  const cl_int status = clSetDeviceClockModeAMD_ptr(
+      device, set_clock_mode_input, set_clock_mode_output);
+  return status;
+}
@@ -0,0 +1,112 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "Timer.h"
+
+#ifdef ATI_OS_WIN
+#include <windows.h>
+#endif
+
+#ifdef ATI_OS_LINUX
+#include <time.h>
+#define NANOSECONDS_PER_SEC 1000000000
+#endif
+
+// Zeroes the accumulated and start tick counts, then records the tick
+// frequency: QueryPerformanceFrequency() on Windows builds, the fixed
+// NANOSECONDS_PER_SEC value on Linux builds.
+CPerfCounter::CPerfCounter() : _clocks(0), _start(0) {
+#if defined(ATI_OS_WIN)
+  LARGE_INTEGER *const frequency = (LARGE_INTEGER *)&_freq;
+  QueryPerformanceFrequency(frequency);
+#endif
+
+#if defined(ATI_OS_LINUX)
+  _freq = NANOSECONDS_PER_SEC;
+#endif
+}
+
+// Nothing to release: the counter holds only integer members.
+CPerfCounter::~CPerfCounter() {}
+
+// Opens a timing interval by storing the current tick count in _start.
+// On Windows builds, calling Start() while an interval is already open
+// (_start != 0) is treated as a fatal usage error.
+void CPerfCounter::Start(void) {
+#ifdef ATI_OS_WIN
+
+  if (_start) {
+    // MessageBoxA is named explicitly: the text is a narrow literal, so the
+    // call also builds when UNICODE is defined.
+    MessageBoxA(NULL, "Bad Perf Counter Start", "Error", MB_OK);
+    // Leave with a failure status so the misuse is not reported as success.
+    exit(1);
+  }
+  QueryPerformanceCounter((LARGE_INTEGER *)&_start);
+
+#endif
+#ifdef ATI_OS_LINUX
+
+  // Monotonic clock reading converted to nanoseconds.
+  struct timespec s;
+  clock_gettime(CLOCK_MONOTONIC, &s);
+  _start = (i64)s.tv_sec * NANOSECONDS_PER_SEC + (i64)s.tv_nsec;
+
+#endif
+}
+
+// Closes the current timing interval: reads the tick count, adds the ticks
+// elapsed since Start() to _clocks and clears _start.
+// On Windows builds, calling Stop() with no open interval (_start == 0) is
+// treated as a fatal usage error.
+void CPerfCounter::Stop(void) {
+  i64 n;
+
+#ifdef ATI_OS_WIN
+
+  if (!_start) {
+    // MessageBoxA is named explicitly: the text is a narrow literal, so the
+    // call also builds when UNICODE is defined.
+    MessageBoxA(NULL, "Bad Perf Counter Stop", "Error", MB_OK);
+    // Leave with a failure status so the misuse is not reported as success.
+    exit(1);
+  }
+
+  QueryPerformanceCounter((LARGE_INTEGER *)&n);
+
+#endif
+#ifdef ATI_OS_LINUX
+
+  // Monotonic clock reading converted to nanoseconds.
+  struct timespec s;
+  clock_gettime(CLOCK_MONOTONIC, &s);
+  n = (i64)s.tv_sec * NANOSECONDS_PER_SEC + (i64)s.tv_nsec;
+
+#endif
+
+  n -= _start;   // ticks elapsed in this interval
+  _start = 0;    // mark the interval as closed
+  _clocks += n;  // accumulate across Start()/Stop() pairs
+}
+
+// Clears the accumulated tick count.
+// On Windows builds, calling Reset() while an interval is still open
+// (_start != 0) is treated as a fatal usage error.
+void CPerfCounter::Reset(void) {
+#ifdef ATI_OS_WIN
+  if (_start) {
+    // MessageBoxA is named explicitly: the text is a narrow literal, so the
+    // call also builds when UNICODE is defined.
+    MessageBoxA(NULL, "Bad Perf Counter Reset", "Error", MB_OK);
+    // Leave with a failure status so the misuse is not reported as success.
+    exit(1);
+  }
+#endif
+  _clocks = 0;
+}
+
+// Returns the accumulated tick count divided by the tick frequency.
+// On Windows builds, calling this while an interval is still open
+// (_start != 0) is treated as a fatal usage error.
+double CPerfCounter::GetElapsedTime(void) {
+#ifdef ATI_OS_WIN
+  if (_start) {
+    // MessageBoxA is named explicitly: the text is a narrow literal, so the
+    // call also builds when UNICODE is defined.
+    MessageBoxA(NULL, "Trying to get time while still running.", "Error",
+                MB_OK);
+    // Leave with a failure status so the misuse is not reported as success.
+    exit(1);
+  }
+#endif
+
+  return (double)_clocks / (double)_freq;
+}
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+#ifdef ATI_OS_WIN
+typedef __int64 i64;  // integer type of the CPerfCounter tick fields (Windows builds)
+#endif
+#ifdef ATI_OS_LINUX
+typedef long long i64;  // integer type of the CPerfCounter tick fields (Linux builds)
+#endif
+
+class CPerfCounter {  // start/stop timer that accumulates elapsed ticks
+ public:
+  CPerfCounter();   // zeroes the counts and records the tick frequency
+  ~CPerfCounter();  // empty
+  void Start(void);  // stores the current tick count in _start
+  void Stop(void);   // adds the ticks since Start() to _clocks, clears _start
+  void Reset(void);  // sets _clocks back to 0
+  double GetElapsedTime(void);  // returns _clocks / _freq as a double
+
+ private:
+  i64 _freq;    // ticks per second (NANOSECONDS_PER_SEC on Linux builds)
+  i64 _clocks;  // ticks accumulated over completed Start()/Stop() pairs
+  i64 _start;   // tick count at the last Start(); 0 after Stop()
+};
+
+#endif  // _TIMER_H_
@@ -0,0 +1,236 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDX11Common.h"
+
+#define D3D_FEATURE_LEVEL_11_1 0xb100  // cast to D3D_FEATURE_LEVEL in open(); presumably defined here for SDK headers lacking the enumerator - TODO confirm
+
+// Resolves extension entry point `x` for platform_ and stores it in the
+// member of the same name (cast to x##_fn). On failure it flags the test as
+// failed, records and prints the error message, bumps _crcword and returns
+// from the enclosing (void) function.
+// The message is a compile-time literal, so no heap buffer is needed and
+// there is no unchecked malloc().
+#define INITPFN(x)                                                     \
+  x = (x##_fn)clGetExtensionFunctionAddressForPlatform(platform_, #x); \
+  if ((x) == NULL) {                                                   \
+    _errorFlag = true;                                                 \
+    printf("%s:%d - %s\n", __FILE__, __LINE__,                         \
+           "Failed to get function pointer for " #x);                  \
+    _errorMsg = std::string("Failed to get function pointer for " #x); \
+    _crcword += 1;                                                     \
+    return;                                                            \
+  }
+
+// Puts every member into a known empty state. close() tests the D3D11
+// pointers and releases _queue, so they must not be left indeterminate when
+// open() has not run (or returned early).
+OCLDX11Common::OCLDX11Common() : OCLTestImp() {
+  extensionsAvailable = false;
+
+  dxD3D11Device = NULL;
+  dxD3D11Context = NULL;
+  dxDX11Texture = NULL;
+  _queue = 0;
+
+  // Extension entry points; resolved in open().
+  clGetDeviceIDsFromD3D11KHR = NULL;
+  clCreateFromD3D11BufferKHR = NULL;
+  clCreateFromD3D11Texture2DKHR = NULL;
+  clCreateFromD3D11Texture3DKHR = NULL;
+  clEnqueueAcquireD3D11ObjectsKHR = NULL;
+  clEnqueueReleaseD3D11ObjectsKHR = NULL;
+  clGetPlaneFromImageAMD = NULL;
+}
+
+// Empty: the queue and D3D11 objects are released in close().
+OCLDX11Common::~OCLDX11Common() {
+}
+
+// Reads the platform and device extension strings and sets
+// extensionsAvailable to true only when the platform reports
+// cl_khr_d3d11_sharing AND the device reports cl_amd_planar_yuv.
+// Also checks that cl_khr_d3d11_sharing is exported exactly when the Windows
+// major version is >= 6.
+// The strings are sized by querying their length first, so long extension
+// lists are not rejected by a fixed-size buffer.
+// NOTE(review): assumes CHECK_RESULT leaves this function when its condition
+// is true - confirm against the macro definition.
+void OCLDX11Common::ExtensionCheck() {
+  cl_int result = CL_SUCCESS;
+  size_t extSize = 0;
+
+  // Known value in case one of the checks below exits early.
+  extensionsAvailable = false;
+
+  result = _wrapper->clGetPlatformInfo(platform_, CL_PLATFORM_EXTENSIONS, 0,
+                                       NULL, &extSize);
+  CHECK_RESULT(result != CL_SUCCESS, "Failed to list platform extensions.");
+
+  // One extra byte keeps the buffer NUL-terminated for strstr().
+  std::vector<char> extensions(extSize + 1, '\0');
+  result = _wrapper->clGetPlatformInfo(platform_, CL_PLATFORM_EXTENSIONS,
+                                       extSize, &extensions[0], NULL);
+  CHECK_RESULT(result != CL_SUCCESS, "Failed to list platform extensions.");
+
+  const bool hasD3D11Sharing =
+      strstr(&extensions[0], "cl_khr_d3d11_sharing") != NULL;
+  extensionsAvailable = hasD3D11Sharing;
+  if (!hasD3D11Sharing) {
+    printf("cl_khr_d3d11_sharing extension is required for this test!\n");
+  }
+
+  OSVERSIONINFOEX versionInfo = {0};
+  versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+  versionInfo.dwMajorVersion = 6;
+
+  DWORDLONG conditionMask = 0;
+  VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
+  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION, conditionMask)) {
+    CHECK_RESULT(!hasD3D11Sharing,
+                 "Extension should be exported on Windows >= 6");
+  } else {
+    CHECK_RESULT(hasD3D11Sharing,
+                 "Extension should not be exported on Windows < 6");
+  }
+
+  extSize = 0;
+  result = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     0, NULL, &extSize);
+  CHECK_RESULT(result != CL_SUCCESS, "Failed to list device extensions.");
+
+  extensions.assign(extSize + 1, '\0');
+  result = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     extSize, &extensions[0], NULL);
+  CHECK_RESULT(result != CL_SUCCESS, "Failed to list device extensions.");
+
+  const bool hasPlanarYuv = strstr(&extensions[0], "cl_amd_planar_yuv") != NULL;
+  if (!hasPlanarYuv) {
+    printf("cl_amd_planar_yuv extension is required for this test!\n");
+  }
+
+  // Both extensions are needed; do not let the second result hide the first.
+  extensionsAvailable = hasD3D11Sharing && hasPlanarYuv;
+}
+
+// Prepares the state shared by the D3D11 interop tests:
+//   1. selects the platform at _platformIndex and lists its devices,
+//   2. runs ExtensionCheck() and returns quietly if the extensions are absent,
+//   3. resolves the D3D11 sharing entry points,
+//   4. creates a D3D11 device (hardware first, then software driver type),
+//   5. creates a CL context and queue on devices_[_deviceId] if that device
+//      is among the devices preferred for the D3D11 device.
+// test, units and conversion are not used here.
+// NOTE(review): assumes CHECK_RESULT leaves this function when its condition
+// is true - confirm against the macro definition.
+void OCLDX11Common::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  // OpenCL Initialization
+  // OCLTestImp::open(test, units, conversion, deviceId);
+  BaseTestImp::open();
+  devices_ = 0;
+  deviceCount_ = 0;
+  context_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  _queue = 0;
+  _deviceId = deviceId;
+  // Known value in case an early exit happens before ExtensionCheck().
+  extensionsAvailable = false;
+
+  dxD3D11Context = NULL;
+  dxD3D11Device = NULL;
+
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_);
+
+  cl_uint numPlatforms = 0;
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
+  CHECK_RESULT((numPlatforms == 0), "No platform found");
+
+  // A vector, so the list is freed on every exit path below.
+  std::vector<cl_platform_id> platforms(numPlatforms);
+  error_ = _wrapper->clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  CHECK_RESULT((_platformIndex >= numPlatforms), "Invalid platform index");
+
+  platform_ = platforms[_platformIndex];
+  CHECK_RESULT((platform_ == 0), "AMD Platform not found");
+
+  error_ = _wrapper->clGetDeviceIDs(platform_, type_, 0, NULL, &deviceCount_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  devices_ = new cl_device_id[deviceCount_];
+  error_ =
+      _wrapper->clGetDeviceIDs(platform_, type_, deviceCount_, devices_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  // Must come before ExtensionCheck(), which indexes devices_[_deviceId].
+  if (deviceId >= deviceCount_) {
+    _errorFlag = true;
+    return;
+  }
+
+  ExtensionCheck();
+  if (!extensionsAvailable) {
+    return;
+  }
+
+  // extract function pointers for exported functions
+  INITPFN(clGetDeviceIDsFromD3D11KHR);
+  INITPFN(clCreateFromD3D11BufferKHR);
+  INITPFN(clCreateFromD3D11Texture2DKHR);
+  INITPFN(clCreateFromD3D11Texture3DKHR);
+  INITPFN(clEnqueueAcquireD3D11ObjectsKHR);
+  INITPFN(clEnqueueReleaseD3D11ObjectsKHR);
+  INITPFN(clGetPlaneFromImageAMD);
+
+  HRESULT hr = S_OK;
+
+  UINT createDeviceFlags = 0;
+
+  D3D_FEATURE_LEVEL featureLevels[] = {
+      (D3D_FEATURE_LEVEL)D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0
+
+  };
+  D3D_FEATURE_LEVEL featureLevel;
+  // Create only the device, not the swapchain. We can't create the swapchain
+  // anyways without a handle to a window we explicitly own
+  hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
+                         createDeviceFlags, featureLevels,
+                         _countof(featureLevels), D3D11_SDK_VERSION,
+                         &dxD3D11Device, &featureLevel, &dxD3D11Context);
+
+  // Fallbacks: same driver type without the 11.1 level, then the software
+  // driver type with and without the 11.1 level.
+  if (FAILED(hr)) {
+    hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
+                           createDeviceFlags, featureLevels + 1,
+                           _countof(featureLevels) - 1, D3D11_SDK_VERSION,
+                           &dxD3D11Device, &featureLevel, &dxD3D11Context);
+  }
+  if (FAILED(hr)) {
+    hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_SOFTWARE, NULL,
+                           createDeviceFlags, featureLevels,
+                           _countof(featureLevels), D3D11_SDK_VERSION,
+                           &dxD3D11Device, &featureLevel, &dxD3D11Context);
+  }
+
+  if (FAILED(hr)) {
+    hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_SOFTWARE, NULL,
+                           createDeviceFlags, featureLevels + 1,
+                           _countof(featureLevels) - 1, D3D11_SDK_VERSION,
+                           &dxD3D11Device, &featureLevel, &dxD3D11Context);
+  }
+  CHECK_RESULT(FAILED(hr), "D3D11CreateDevice failed (0x%08lx)",
+               (unsigned long)hr);
+
+  cl_int status = 0;
+  cl_context_properties cps[7] = {
+      CL_CONTEXT_D3D11_DEVICE_KHR,
+      (cl_context_properties)(ID3D11Device*)dxD3D11Device,
+      CL_CONTEXT_INTEROP_USER_SYNC,
+      CL_FALSE,
+      CL_CONTEXT_PLATFORM,
+      (cl_context_properties)platform_,
+      0};
+  cl_context_properties* cprops = (NULL == platform_) ? NULL : cps;
+
+  cl_uint deviceListSize = 0;
+  clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device,
+                             CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 0, NULL,
+                             &deviceListSize);
+
+  // Value-initialised, so entries stay null if the second query fails.
+  std::vector<cl_device_id> devices(deviceListSize);
+  if (deviceListSize > 0) {
+    clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device,
+                               CL_PREFERRED_DEVICES_FOR_D3D11_KHR,
+                               deviceListSize, &devices[0], NULL);
+  }
+
+  bool ret = false;
+  // Check that current device can be associated with the D3D11 device
+  for (unsigned int i = 0; i < deviceListSize; i++) {
+    if (devices[i] == devices_[_deviceId]) {
+      ret = true;
+      break;
+    }
+  }
+  CHECK_RESULT((ret != true), "Can't find D3D device!");
+
+  context_ =
+      clCreateContext(cprops, 1, &devices_[_deviceId], NULL, NULL, &status);
+  CHECK_RESULT((status != CL_SUCCESS), "clCreateContext failed (%d)", status);
+
+  _queue = clCreateCommandQueue(context_, devices_[_deviceId], 0, &status);
+  CHECK_RESULT((status != CL_SUCCESS), "clCreateCommandQueue failed (%d)",
+               status);
+}
+
+// Releases the command queue, runs OCLTestImp::close(), then releases the
+// D3D11 context and device. Returns the value from OCLTestImp::close().
+// Each handle is released only if set and cleared afterwards, so calling
+// close() again (or after an open() that stopped early) does not release a
+// handle twice.
+unsigned int OCLDX11Common::close(void) {
+  if (_queue) {
+    clReleaseCommandQueue(_queue);
+    _queue = 0;
+  }
+  unsigned int retVal = OCLTestImp::close();
+  // deleteDXDevice(hDX_);
+  if (dxD3D11Context) {
+    dxD3D11Context->Release();
+    dxD3D11Context = NULL;
+  }
+  if (dxD3D11Device) {
+    dxD3D11Device->Release();
+    dxD3D11Device = NULL;
+  }
+  return retVal;
+}
@@ -0,0 +1,68 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DX11_COMMON_H_
+#define _OCL_DX11_COMMON_H_
+
+#include <CL/cl.h>
+#include <CL/cl_d3d11.h>
+
+#include "OCLTestImp.h"
+#include "d3d11.h"
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL* clGetPlaneFromImageAMD_fn)(
+    cl_context /* context */, cl_mem /* mem */, cl_uint /* plane */,
+    cl_int* /* errcode_ret */);  // pointer type for the clGetPlaneFromImageAMD entry point
+
+class OCLDX11Common : public OCLTestImp {
+ public:
+  /////////////////////////////////////////
+  // construction and destruction        //
+  /////////////////////////////////////////
+  OCLDX11Common();
+  virtual ~OCLDX11Common();
+  ///////////////////////
+  // virtual interface //
+  ///////////////////////
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual unsigned int close(void);
+
+ protected:
+  bool extensionsAvailable;  // written by ExtensionCheck(), read by open()
+
+  ID3D11Device* dxD3D11Device;  // created in open(), released in close()
+  ID3D11DeviceContext* dxD3D11Context;  // created in open(), released in close()
+  ID3D11Texture2D* dxDX11Texture;  // not touched by this class's own methods
+  cl_command_queue _queue;  // created in open(), released in close()
+
+  clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR;  // entry points below are resolved in open()
+  clCreateFromD3D11BufferKHR_fn clCreateFromD3D11BufferKHR;
+  clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR;
+  clCreateFromD3D11Texture3DKHR_fn clCreateFromD3D11Texture3DKHR;
+  clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR;
+  clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR;
+  clGetPlaneFromImageAMD_fn clGetPlaneFromImageAMD;
+
+ private:
+  void ExtensionCheck();  // queries platform/device extension strings; called from open()
+};
+
+#endif  // _OCL_DX11_COMMON_H_
@@ -0,0 +1,478 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDX11YUY2.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+// Raw numeric values for the NV12 and P010 formats; the test casts them to
+// DXGI_FORMAT before use.
+// NOTE(review): presumably defined here for SDK headers that lack these
+// enumerators - confirm they do not clash with the installed dxgiformat.h.
+#define DXGI_FORMAT_NV12 103
+#define DXGI_FORMAT_P010 104
+// Initial work-group width assigned to blockSizeX in the constructor.
+#define GROUP_SIZE 256
+
+// OpenCL C source of the image2imageCopy kernel: each work-item reads one
+// texel of `input` at its 2D global id with read_imageui (unnormalized
+// coordinates, clamp addressing, nearest filtering) and writes it to the same
+// coordinate of `output`.
+const static char strKernel[] =
+    "__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | "
+    "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n"
+    "__kernel void image2imageCopy(                                            "
+    "                             \n"
+    "   __read_only image2d_t input,                                           "
+    "                             \n"
+    "   __write_only image2d_t output)                                         "
+    "                             \n"
+    "{                                                                         "
+    "                             \n"
+    "   int2 coord = (int2)(get_global_id(0), get_global_id(1));               "
+    "                             \n"
+    "   uint4 temp = read_imageui(input, imageSampler, coord);                 "
+    "                             \n"
+    "   write_imageui(output, coord, temp);                                    "
+    "                             \n"
+    "}                                                                         "
+    "                             \n";
+
+// Registers four sub-tests and sets the default work-group shape
+// (GROUP_SIZE x 1). clImage2DOut and dxFormat, which are otherwise first
+// assigned in open(), start out cleared so that a close() without a
+// preceding open() does not read an indeterminate clImage2DOut.
+OCLDX11YUY2::OCLDX11YUY2()
+    : OCLDX11Common(),
+      blockSizeX(GROUP_SIZE),
+      blockSizeY(1),
+      clImage2DOut(0),
+      dxFormat(DXGI_FORMAT_UNKNOWN) {
+  _numSubTests = 4;
+}
+
+// Nothing to release here: close() releases clImage2DOut and dxDX11Texture.
+OCLDX11YUY2::~OCLDX11YUY2() {}
+
+// Prepares one sub-test: runs the common open(), selects the texture format
+// (NV12 for sub-tests 0-1, P010 otherwise), and - when the device supports
+// that format - builds the copy kernel and allocates the output image.
+// If the extensions or the format are unavailable the test is left in a
+// "skip" state (extensionsAvailable == false) rather than flagged as failed.
+void OCLDX11YUY2::open(unsigned int test, char *units, double &conversion,
+                       unsigned int deviceId) {
+  // Clear the handles that close() releases.
+  dxDX11Texture = 0;
+  clImage2DOut = 0;
+  _openTest = test;
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+
+  OCLDX11Common::open(test, units, conversion, deviceId);
+  if (_errorFlag || !extensionsAvailable) {
+    return;
+  }
+
+  const bool wantsNV12 = (_openTest < 2);
+  dxFormat = (DXGI_FORMAT)(wantsNV12 ? DXGI_FORMAT_NV12 : DXGI_FORMAT_P010);
+
+  // From here on extensionsAvailable also reflects format support.
+  extensionsAvailable = formatSupported();
+  if (!extensionsAvailable) {
+    if (wantsNV12) {
+      printf("DXGI_FORMAT_NV12 is required for this test!\n");
+    } else {
+      printf("DXGI_FORMAT_P010 is required for this test!\n");
+    }
+    return;
+  }
+
+  CompileKernel();
+  AllocateOpenCLImage();
+}
+
+// Creates a CPU-accessible staging texture, fills it with a known pattern
+// (0x7F in the first HEIGHT rows, 0x1F/0x2F byte pairs in the following
+// HEIGHT / 2 rows), copies it into a GPU texture (dxDX11Texture) and then
+// runs the interop checks. Sub-test 0 creates the GPU texture without
+// D3D11_RESOURCE_MISC_SHARED; every other sub-test requests it.
+//
+// Any failure to create or map a texture is reported through CHECK_RESULT
+// instead of continuing with an invalid or unfilled resource.
+void OCLDX11YUY2::run(void) {
+  if (_errorFlag) return;
+  if (!extensionsAvailable) return;
+
+  D3D11_TEXTURE2D_DESC Desc = {0};
+
+  Desc.ArraySize = 1;
+  Desc.Format = dxFormat;
+  Desc.Width = OCLDX11YUY2::WIDTH;
+  Desc.Height = OCLDX11YUY2::HEIGHT;
+  Desc.MipLevels = 1;
+  Desc.SampleDesc.Count = 1;
+  // Desc.MiscFlags=D3D11_RESOURCE_MISC_SHARED; //MM for fast GPU interop
+  // MM: these flags are incompatible with D3D11_RESOURCE_MISC_SHARED
+  // now we allocate texture without CPU access and if needed use temp texture
+  // (see FromSystemToDX11 and FromDX11ToSystem)
+
+  Desc.Usage = D3D11_USAGE_STAGING;
+  Desc.BindFlags = 0;
+  Desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE | D3D11_CPU_ACCESS_READ;
+
+  // Initialized so the failure path below never inspects an indeterminate
+  // pointer.
+  ID3D11Texture2D *pTextureTmp = NULL;
+  HRESULT hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &pTextureTmp);
+
+  D3D11_MAPPED_SUBRESOURCE LockedRectD11 = {0};
+  if (SUCCEEDED(hr)) {
+    hr =
+        dxD3D11Context->Map(pTextureTmp, 0, D3D11_MAP_WRITE, 0, &LockedRectD11);
+  }
+  if (FAILED(hr)) {
+    if (pTextureTmp != NULL) pTextureTmp->Release();
+    CHECK_RESULT(true, "run() failed to create or map the staging texture");
+  }
+
+  // fill memory with something
+  BYTE *pBase = (BYTE *)LockedRectD11.pData;
+  const size_t rowPitch = LockedRectD11.RowPitch;
+
+  // First HEIGHT rows: WIDTH bytes of Y per row.
+  for (unsigned int y = 0; y < OCLDX11YUY2::HEIGHT; y++) {
+    BYTE *pLine = pBase + y * rowPitch;
+    for (unsigned int x = 0; x < OCLDX11YUY2::WIDTH; x++) {
+      pLine[x] = 0x7F;  // Y
+    }
+  }
+  // Next HEIGHT / 2 rows: WIDTH / 2 interleaved U,V byte pairs per row. The
+  // row pointer is only formed for rows that are actually written.
+  for (unsigned int y = 0; y < OCLDX11YUY2::HEIGHT / 2; y++) {
+    BYTE *pLineUV = pBase + (OCLDX11YUY2::HEIGHT + y) * rowPitch;
+    for (unsigned int x = 0; x < OCLDX11YUY2::WIDTH / 2; x++) {
+      pLineUV[2 * x] = 0x1F;      // U
+      pLineUV[2 * x + 1] = 0x2F;  // V
+    }
+  }
+
+  dxD3D11Context->Unmap(pTextureTmp, 0);
+
+  Desc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE;
+  Desc.Usage = D3D11_USAGE_DEFAULT;
+  Desc.CPUAccessFlags = 0;
+  Desc.MiscFlags = (_openTest == 0)
+                       ? 0
+                       : D3D11_RESOURCE_MISC_SHARED;  // MM for fast GPU interop
+
+  hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &dxDX11Texture);
+  if (FAILED(hr) || dxDX11Texture == NULL) {
+    pTextureTmp->Release();
+    CHECK_RESULT(true, "run() failed to create the GPU texture");
+  }
+
+  dxD3D11Context->CopySubresourceRegion(dxDX11Texture, 0, 0, 0, 0, pTextureTmp,
+                                        0, NULL);
+  pTextureTmp->Release();
+
+  testInterop();
+}
+
+// Creates the write-only 2D output image (clImage2DOut) that the copy kernel
+// targets. It is single-channel (CL_R), WIDTH wide and HEIGHT + HEIGHT / 2
+// rows tall; elements are 8-bit for NV12 and 16-bit otherwise.
+void OCLDX11YUY2::AllocateOpenCLImage() {
+  cl_image_format imgFormat{};
+  imgFormat.image_channel_order = CL_R;
+  if (dxFormat == DXGI_FORMAT_NV12) {
+    imgFormat.image_channel_data_type = CL_UNSIGNED_INT8;
+  } else {
+    imgFormat.image_channel_data_type = CL_UNSIGNED_INT16;
+  }
+
+  cl_image_desc imgDesc{};
+  imgDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  imgDesc.image_width = WIDTH;
+  imgDesc.image_height = HEIGHT + HEIGHT / 2;
+
+  cl_int createStatus = 0;
+  clImage2DOut = clCreateImage(context_, CL_MEM_WRITE_ONLY, &imgFormat,
+                               &imgDesc, NULL, &createStatus);
+  CHECK_RESULT((createStatus != CL_SUCCESS), "AllocateOpenCLImage() failed");
+}
+
+// Wraps dxDX11Texture in an OpenCL image, acquires it for OpenCL, and checks
+//   1. a kernel copy into clImage2DOut plus host read-back of both images,
+//   2. the Y and UV planes obtained through clGetPlaneFromImageAMD.
+// Every OpenCL object created here is released on all paths, and the D3D11
+// object is released back before any plane-related failure is reported.
+void OCLDX11YUY2::testInterop() {
+  // alloc
+  cl_int clStatus = 0;
+  cl_mem clImage2D =
+      clCreateFromD3D11Texture2DKHR(context_, 0, dxDX11Texture, 0, &clStatus);
+  CHECK_RESULT((clStatus != CL_SUCCESS),
+               "clCreateFromD3D11Texture2DKHR() failed");
+
+  // bring objects to the queue
+  cl_event clEvent = NULL;
+  clStatus = clEnqueueAcquireD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL,
+                                             &clEvent);
+  if (clStatus == CL_SUCCESS) {
+    // Only wait on the event when the enqueue actually produced one.
+    clStatus = clWaitForEvents(1, &clEvent);
+    clReleaseEvent(clEvent);
+  }
+  if (clStatus != CL_SUCCESS) {
+    clReleaseMemObject(clImage2D);
+    CHECK_RESULT(true, "clEnqueueAcquireD3D11ObjectsKHR() failed");
+  }
+
+  CopyOpenCLImage(clImage2D);
+  bool ImageReadWorks = CheckCLImage(clImage2D);
+  bool bKernelWorks = CheckCLImage(clImage2DOut);
+  CHECK_RESULT_NO_RETURN((ImageReadWorks != true),
+                         "CheckCLImage(clImage2D) failed");
+  CHECK_RESULT_NO_RETURN((bKernelWorks != true),
+                         "CheckCLImage(clImage2DOut) failed");
+
+  // Plane statuses are kept separately so clean-up can run before a failure
+  // is reported.
+  cl_int statusY = CL_SUCCESS;
+  cl_int statusUV = CL_SUCCESS;
+  cl_mem planeY = clGetPlaneFromImageAMD(context_, clImage2D, 0, &statusY);
+  cl_mem planeUV = NULL;
+  if (statusY == CL_SUCCESS) {
+    planeUV = clGetPlaneFromImageAMD(context_, clImage2D, 1, &statusUV);
+  }
+
+  bool ImageWorksY = false;
+  bool ImageWorksUV = false;
+  if (statusY == CL_SUCCESS && statusUV == CL_SUCCESS) {
+    ImageWorksY = CheckCLImageY(planeY);
+    ImageWorksUV = CheckCLImageUV(planeUV);
+  }
+
+  if (statusY == CL_SUCCESS && planeY != NULL) clReleaseMemObject(planeY);
+  if (statusUV == CL_SUCCESS && planeUV != NULL) clReleaseMemObject(planeUV);
+
+  // release object from the queue
+  clEvent = NULL;
+  cl_int releaseStatus =
+      clEnqueueReleaseD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent);
+  if (releaseStatus == CL_SUCCESS) {
+    releaseStatus = clWaitForEvents(1, &clEvent);
+    clReleaseEvent(clEvent);
+  }
+
+  // release mem object
+  clReleaseMemObject(clImage2D);
+
+  CHECK_RESULT_NO_RETURN((releaseStatus != CL_SUCCESS),
+                         "clEnqueueReleaseD3D11ObjectsKHR() failed");
+  CHECK_RESULT((statusY != CL_SUCCESS),
+               "clGetPlaneFromImageAMD(context_,clImage2D,0,&clStatus) failed");
+  CHECK_RESULT((statusUV != CL_SUCCESS),
+               "clGetPlaneFromImageAMD(context_,clImage2D,1,&clStatus) failed");
+
+  CHECK_RESULT_NO_RETURN((ImageWorksY != true), "CheckCLImageY() failed");
+  CHECK_RESULT_NO_RETURN((ImageWorksUV != true), "CheckCLImageUV() failed");
+}
+
+// Releases the output image and the D3D11 texture created by this test, then
+// delegates to the common close(). Handles are cleared after release so a
+// repeated close() cannot release them twice.
+// Returns whatever OCLDX11Common::close() returns.
+unsigned int OCLDX11YUY2::close(void) {
+  if (clImage2DOut) {
+    clReleaseMemObject(clImage2DOut);
+    clImage2DOut = 0;
+  }
+  if (dxDX11Texture) {
+    dxDX11Texture->Release();
+    dxDX11Texture = 0;
+  }
+  return OCLDX11Common::close();
+}
+
+// Reads a full image (HEIGHT rows of Y followed by HEIGHT / 2 rows of
+// interleaved U,V) back to the host and verifies the pattern written by
+// run(): 0x7F for the first WIDTH bytes of each Y row, 0x1F/0x2F for the
+// first WIDTH / 2 byte pairs of each UV row.
+//
+// clImage: image expected to be HEIGHT + HEIGHT / 2 rows tall; a different
+//          height is recorded as a test error but the check still proceeds.
+// Returns true when all checked bytes match; false on any mismatch or when an
+// image query or the read-back fails.
+bool OCLDX11YUY2::CheckCLImage(cl_mem clImage) {
+  size_t pitch = 0;
+  cl_int clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
+  // A failed query or zero pitch would size the host buffer to zero bytes
+  // while the read below still writes full rows into it.
+  if (clStatus != CL_SUCCESS || pitch == 0) return false;
+  pitch *= 2;  // host rows use twice the reported pitch
+
+  size_t height = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  if (clStatus != CL_SUCCESS) return false;
+
+  CHECK_RESULT_NO_RETURN(height != (HEIGHT + HEIGHT / 2),
+                         "CheckCLImage: height!=(HEIGHT+HEIGHT/2)");
+
+  const size_t rows = HEIGHT + HEIGHT / 2;
+  char *pTempBuffer = new char[rows * pitch];
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {WIDTH, rows, 1};
+  clStatus = clEnqueueReadImage(_queue, clImage, CL_TRUE, origin, region, pitch,
+                                0, pTempBuffer, 0, NULL, NULL);
+
+  ::clFinish(_queue);
+
+  // A failed read leaves the buffer unspecified, so treat it as a mismatch.
+  bool matches = (clStatus == CL_SUCCESS);
+
+  // Y rows
+  for (size_t y = 0; matches && y < HEIGHT; y++) {
+    const char *pLine = pTempBuffer + y * pitch;
+    for (size_t x = 0; x < WIDTH; x++) {
+      if (pLine[x] != 0x7F) {  // Y
+        matches = false;
+        break;
+      }
+    }
+  }
+
+  // UV rows start right after the HEIGHT rows of Y.
+  for (size_t y = 0; matches && y < HEIGHT / 2; y++) {
+    const char *pLineUV = pTempBuffer + (HEIGHT + y) * pitch;
+    for (size_t x = 0; x < WIDTH / 2; x++) {
+      if (pLineUV[2 * x] != 0x1F ||      // U
+          pLineUV[2 * x + 1] != 0x2F) {  // V
+        matches = false;
+        break;
+      }
+    }
+  }
+
+  delete[] pTempBuffer;
+
+  return matches;
+}
+
+// Reads the Y plane back to the host and verifies that the first WIDTH bytes
+// of each of its HEIGHT rows hold 0x7F.
+//
+// clImage: plane image expected to be HEIGHT rows tall; a different height is
+//          recorded as a test error but the check still proceeds.
+// Returns true when all checked bytes match; false on any mismatch or when an
+// image query or the read-back fails.
+bool OCLDX11YUY2::CheckCLImageY(cl_mem clImage) {
+  size_t pitch = 0;
+  cl_int clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
+  // A failed query or zero pitch would size the host buffer to zero bytes
+  // while the read below still writes full rows into it.
+  if (clStatus != CL_SUCCESS || pitch == 0) return false;
+  pitch *= 2;  // host rows use twice the reported pitch
+
+  size_t height = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  if (clStatus != CL_SUCCESS) return false;
+
+  CHECK_RESULT_NO_RETURN(height != HEIGHT, "CheckCLImageY: height!=HEIGHT");
+
+  char *pTempBuffer = new char[HEIGHT * pitch];
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {WIDTH, HEIGHT, 1};
+  clStatus = clEnqueueReadImage(_queue, clImage, CL_TRUE, origin, region, pitch,
+                                0, pTempBuffer, 0, NULL, NULL);
+
+  ::clFinish(_queue);
+
+  // A failed read leaves the buffer unspecified, so treat it as a mismatch.
+  bool matches = (clStatus == CL_SUCCESS);
+  for (size_t y = 0; matches && y < HEIGHT; y++) {
+    const char *pLine = pTempBuffer + y * pitch;
+    for (size_t x = 0; x < WIDTH; x++) {
+      if (pLine[x] != 0x7F) {  // Y
+        matches = false;
+        break;
+      }
+    }
+  }
+
+  delete[] pTempBuffer;
+
+  return matches;
+}
+
+// Reads the UV plane back to the host and verifies that each of its
+// HEIGHT / 2 rows starts with WIDTH / 2 byte pairs of 0x1F (U), 0x2F (V).
+//
+// clImage: plane image expected to be HEIGHT / 2 rows tall; a different
+//          height is recorded as a test error but the check still proceeds.
+// Returns true when all checked bytes match; false on any mismatch or when an
+// image query or the read-back fails.
+bool OCLDX11YUY2::CheckCLImageUV(cl_mem clImage) {
+  size_t pitch = 0;
+  cl_int clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
+  // A failed query or zero pitch would size the host buffer to zero bytes
+  // while the read below still writes full rows into it.
+  if (clStatus != CL_SUCCESS || pitch == 0) return false;
+  pitch *= 2;  // host rows use twice the reported pitch
+
+  size_t height = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  if (clStatus != CL_SUCCESS) return false;
+
+  CHECK_RESULT_NO_RETURN(height != HEIGHT / 2,
+                         "CheckCLImageUV: height!=HEIGHT/2");
+
+  char *pTempBuffer = new char[(HEIGHT / 2) * pitch];
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {WIDTH / 2, HEIGHT / 2, 1};
+  clStatus = clEnqueueReadImage(_queue, clImage, CL_TRUE, origin, region, pitch,
+                                0, pTempBuffer, 0, NULL, NULL);
+
+  ::clFinish(_queue);
+
+  // A failed read leaves the buffer unspecified, so treat it as a mismatch.
+  bool matches = (clStatus == CL_SUCCESS);
+  for (size_t y = 0; matches && y < HEIGHT / 2; y++) {
+    const char *pLineUV = pTempBuffer + y * pitch;
+    for (size_t x = 0; x < WIDTH / 2; x++) {
+      if (pLineUV[2 * x] != 0x1F ||      // U
+          pLineUV[2 * x + 1] != 0x2F) {  // V
+        matches = false;
+        break;
+      }
+    }
+  }
+
+  delete[] pTempBuffer;
+
+  return matches;
+}
+
+// Runs the image2imageCopy kernel to copy clImageSrc into clImage2DOut over a
+// WIDTH x (HEIGHT + HEIGHT / 2) global range and waits for completion.
+// Any OpenCL failure is reported through CHECK_RESULT.
+//
+// clImageSrc: source image bound to kernel argument 0.
+void OCLDX11YUY2::CopyOpenCLImage(cl_mem clImageSrc) {
+  cl_int status = 0;
+
+  // Set appropriate arguments to the kernel2D
+
+  // input buffer image
+  status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImageSrc);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at "
+               "clSetKernelArg(kernel_,0,sizeof(cl_mem),&clImageSrc)");
+  status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clImage2DOut);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at "
+               "clSetKernelArg(kernel_,1,sizeof(cl_mem),&clImage2DOut)");
+
+  // Enqueue a kernel run call. No local work size is passed (NULL), so the
+  // blockSizeX/blockSizeY members are not used for this launch.
+  size_t globalThreads[] = {WIDTH, HEIGHT + HEIGHT / 2};
+  status = clEnqueueNDRangeKernel(_queue, kernel_, 2, NULL, globalThreads, NULL,
+                                  0, NULL, NULL);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at clEnqueueNDRangeKernel");
+
+  status = clFinish(_queue);
+  CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLImage() failed at clFinish");
+}
+
+// Builds the image2imageCopy program for the selected device, creates the
+// kernel object and clamps the work-group shape to what the kernel supports.
+// Each failing step is reported through CHECK_RESULT; on a compile failure
+// the build log is printed first.
+void OCLDX11YUY2::CompileKernel() {
+  cl_int status = 0;
+
+  // The length handed to OpenCL excludes the array's terminating NUL.
+  size_t kernelSize = sizeof(strKernel) - 1;
+  const char *strs = strKernel;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs,
+                                                 &kernelSize, &status);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "clCreateProgramWithSource() failed (%d)", status);
+
+  status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (status != CL_SUCCESS) {
+    if (status == CL_BUILD_PROGRAM_FAILURE) {
+      // First query the log size, then fetch the log itself.
+      size_t buildLogSize = 0;
+      cl_int logStatus =
+          clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize);
+      if (logStatus == CL_SUCCESS && buildLogSize > 0) {
+        std::string buildLog(buildLogSize, '\0');
+        logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                          CL_PROGRAM_BUILD_LOG, buildLogSize,
+                                          &buildLog[0], NULL);
+        if (logStatus == CL_SUCCESS) {
+          printf("%s", buildLog.c_str());
+        }
+      }
+    }
+    CHECK_RESULT(true, "clBuildProgram() failed (%d)", status);
+  }
+
+  // get a kernel object handle for a kernel with the given name
+  kernel_ = _wrapper->clCreateKernel(program_, "image2imageCopy", &status);
+  CHECK_RESULT((status != CL_SUCCESS), "clCreateKernel() failed (%d)", status);
+
+  size_t kernel2DWorkGroupSize = 0;
+  status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId],
+                                    CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
+                                    &kernel2DWorkGroupSize, 0);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "clGetKernelWorkGroupInfo() failed (%d)", status);
+
+  // Fall back to a single row of work-items when the requested shape does not
+  // fit the kernel's maximum work-group size.
+  if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize &&
+      blockSizeX > kernel2DWorkGroupSize) {
+    blockSizeX = kernel2DWorkGroupSize;
+    blockSizeY = 1;
+  }
+}
+
+// Returns true when the D3D11 device reports 2D-texture support for dxFormat.
+// A failing CheckFormatSupport() call is treated as "not supported" instead
+// of relying on the contents of the output parameter.
+bool OCLDX11YUY2::formatSupported() {
+  UINT supported = 0u;
+  const HRESULT hr = dxD3D11Device->CheckFormatSupport(dxFormat, &supported);
+  if (FAILED(hr)) {
+    return false;
+  }
+  return (supported & D3D11_FORMAT_SUPPORT_TEXTURE2D) != 0;
+}
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DX11_YUY2_H_
+#define _OCL_DX11_YUY2_H_
+
+#include "OCLDX11Common.h"
+
+// D3D11/OpenCL interop test for planar YUV textures. Despite the class name,
+// the implementation selects DXGI_FORMAT_NV12 or DXGI_FORMAT_P010 depending
+// on the sub-test. A pattern is written into a D3D11 texture and read back
+// through OpenCL as a whole image, through a kernel copy, and per plane.
+class OCLDX11YUY2 : public OCLDX11Common {
+ public:
+  OCLDX11YUY2();
+  virtual ~OCLDX11YUY2();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ protected:
+  // Dimensions, in pixels, of the test texture.
+  static const unsigned int WIDTH = 1280;
+  static const unsigned int HEIGHT = 720;
+
+  void testInterop();
+  void AllocateOpenCLImage();
+  bool CheckCLImage(cl_mem clImage);
+  bool CheckCLImageY(cl_mem clImage);
+  bool CheckCLImageUV(cl_mem clImage);
+  void CopyOpenCLImage(cl_mem clImageSrc);
+  void CompileKernel();
+  bool formatSupported();
+  // NOTE(review): declared here, but no definition is visible in the
+  // accompanying implementation - confirm whether it is still needed.
+  void testFormat();
+
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  cl_mem clImage2DOut;  // output image written by the copy kernel
+  DXGI_FORMAT dxFormat;  // texture format selected in open()
+};
+
+#endif  // _OCL_DX11_YUY2_H_
@@ -0,0 +1,52 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+//
+// Includes for tests
+//
+#ifdef ATI_OS_WIN
+#include "OCLDX11YUY2.h"
+#endif
+
+//
+//  Helper macro for adding tests
+//
+// Factory used by the test table: default-constructs a T on the heap and
+// hands it back as an untyped pointer. The caller owns the object.
+template <typename T>
+static void* dictionary_CreateTestFunc(void) {
+  T* instance = new T();
+  return instance;
+}
+
+// Expands to a TestEntry initializer: the stringized class name paired with
+// the factory function instantiated for that class.
+#define TEST(name) \
+  { #name, &dictionary_CreateTestFunc < name> }
+
+// Test table for this module. The D3D11 test is only registered when
+// ATI_OS_WIN is defined; otherwise the table holds a placeholder entry and
+// the reported count is zero.
+#ifdef ATI_OS_WIN
+
+TestEntry TestList[] = {TEST(OCLDX11YUY2)};
+
+unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);
+#else
+// Placeholder entry; TestListCount is 0 in this configuration.
+TestEntry TestList[] = {{"void", 0}};
+unsigned int TestListCount = 0;
+
+#endif
+// Version number and name reported for this test library.
+unsigned int TestLibVersion = 0;
+const char* TestLibName = "ocldx";
@@ -0,0 +1 @@
+# all clear
@@ -0,0 +1,220 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLBuffer.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+// OpenCL C source of the glbuffer_test kernel: for each work-item index it
+// stores source + 1 (per component) into clDest and source + 2 into glDest.
+const static char* strKernel =
+    "__kernel void glbuffer_test( __global uint4 *source, __global uint4 "
+    "*glDest, __global uint4 *clDest)   \n"
+    "{                                                                         "
+    "                             \n"
+    "    int  tid = get_global_id(0);                                          "
+    "                             \n"
+    "    clDest[ tid ] = source[ tid ] + (uint4)(1);                           "
+    "                             \n"
+    "    glDest[ tid ] = source[ tid ] + (uint4)(2);                           "
+    "                             \n"
+    "}                                                                         "
+    "                             \n";
+
+// Starts with no GL buffer names allocated and registers a single sub-test.
+OCLGLBuffer::OCLGLBuffer() {
+  inGLBuffer_ = 0;
+  outGLBuffer_ = 0;
+  _numSubTests = 1;
+}
+
+// Nothing to release here: close() deletes the GL buffers and CL mem objects.
+OCLGLBuffer::~OCLGLBuffer() {}
+
+// Prepares the test: runs the common GL open(), then creates and builds the
+// glbuffer_test program and kernel for the requested device. Any failure is
+// reported through CHECK_RESULT; on a build failure the build log is printed
+// first.
+void OCLGLBuffer::open(unsigned int test, char* units, double& conversion,
+                       unsigned int deviceId) {
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled so that a failed log query prints an empty string rather
+    // than uninitialized stack contents.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Exercises CL/GL buffer sharing:
+//   1. fills a GL input buffer with random data and creates a GL output
+//      buffer,
+//   2. wraps both as CL buffers and adds a plain CL output buffer,
+//   3. runs glbuffer_test between acquire/release of the GL objects,
+//   4. reads the CL output through OpenCL and the GL output through
+//      glMapBuffer, and checks them against input + 1 and input + 2.
+// Any failure is reported through CHECK_RESULT, including a failed
+// glMapBuffer().
+void OCLGLBuffer::run(void) {
+  if (_errorFlag) {
+    return;
+  }
+
+  cl_mem buffer;
+  cl_uint4 inData[c_numOfElements] = {{{0}}};
+  cl_uint4 outDataCL[c_numOfElements] = {{{0}}};
+  cl_uint4 outDataGL[c_numOfElements] = {{{0}}};
+
+  // Initialize input data with random values
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      inData[i].s[j] = (unsigned int)rand();
+    }
+  }
+
+  // Generate and Bind in & out OpenGL buffers
+  glGenBuffers(1, &inGLBuffer_);
+  glGenBuffers(1, &outGLBuffer_);
+
+  glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer_);
+  glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inData,
+               GL_STATIC_DRAW);
+
+  glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_);
+  glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), outDataGL,
+               GL_STATIC_DRAW);
+
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+  glFinish();
+
+  // Create input buffer from GL input buffer
+  buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_READ_ONLY,
+                                          inGLBuffer_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create input GL buffer (%d)",
+               error_);
+  buffers_.push_back(buffer);
+
+  // Create output buffer from GL output buffer
+  buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_WRITE_ONLY,
+                                          outGLBuffer_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create output GL buffer (%d)",
+               error_);
+  buffers_.push_back(buffer);
+
+  // Create a CL output buffer
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                    c_numOfElements * sizeof(cl_uint4), NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)", error_);
+  buffers_.push_back(buffer);
+
+  // Assign args and execute; the push order above matches the kernel's
+  // (source, glDest, clDest) parameter order.
+  for (unsigned int i = 0; i < buffers_.size(); i++) {
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+  }
+
+  // Only the first two buffers are GL-backed and need acquire/release.
+  error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2,
+                                               &buffers()[0], 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+               error_);
+
+  size_t gws[1] = {c_numOfElements};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+               error_);
+
+  error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2,
+                                               &buffers()[0], 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReleaseGLObjects failed (%d)",
+               error_);
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_);
+
+  // Get the results from both CL and GL buffers
+  error_ = _wrapper->clEnqueueReadBuffer(
+      cmdQueues_[_deviceId], buffers()[2], CL_TRUE, 0,
+      c_numOfElements * sizeof(cl_uint4), outDataCL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! (%d)",
+               error_);
+
+  glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_);
+  void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+  // Guard the copy below against a NULL mapping.
+  CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+  memcpy(outDataGL, glMem, c_numOfElements * sizeof(cl_uint4));
+  glUnmapBuffer(GL_ARRAY_BUFFER);
+
+  cl_uint4 expectedCL = {{0}};
+  cl_uint4 expectedGL = {{0}};
+
+  // Check output
+  for (unsigned int i = 0; i < c_numOfElements; ++i) {
+    // Calculate expected value in CL output buffer (input + 1)
+    expectedCL = inData[i];
+    expectedCL.s[0]++;
+    expectedCL.s[1]++;
+    expectedCL.s[2]++;
+    expectedCL.s[3]++;
+
+    // Calculate expected value in GL output buffer (input + 2)
+    expectedGL = inData[i];
+    expectedGL.s[0] += 2;
+    expectedGL.s[1] += 2;
+    expectedGL.s[2] += 2;
+    expectedGL.s[3] += 2;
+
+    // Compare expected output with actual data received
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]),
+                   "Element %d in CL output buffer is incorrect!\n\t \
+                         expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
+                   i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2],
+                   expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1],
+                   outDataCL[i].s[2], outDataCL[i].s[3]);
+      CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]),
+                   "Element %d in GL output buffer is incorrect!\n\t \
+                         expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
+                   i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2],
+                   expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1],
+                   outDataGL[i].s[2], outDataGL[i].s[3]);
+    }
+  }
+}
+
+// Releases every CL mem object created by run(), deletes both GL buffers and
+// resets their names to 0, then delegates to the common GL close().
+// Returns whatever OCLGLCommon::close() returns.
+unsigned int OCLGLBuffer::close(void) {
+  unsigned int idx = 0;
+  while (idx < buffers().size()) {
+    clReleaseMemObject(buffers()[idx]);
+    ++idx;
+  }
+  buffers_.clear();
+
+  // Unbind before deleting the GL in & out buffers.
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+  glDeleteBuffers(1, &inGLBuffer_);
+  inGLBuffer_ = 0;
+
+  glDeleteBuffers(1, &outGLBuffer_);
+  outGLBuffer_ = 0;
+
+  const unsigned int result = OCLGLCommon::close();
+  return result;
+}
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_BUFFER_H_
+#define _OCL_GL_BUFFER_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL buffer sharing test: runs a kernel that reads one GL buffer and
+// writes both a GL buffer and a plain CL buffer, then verifies both outputs.
+class OCLGLBuffer : public OCLGLCommon {
+ public:
+  OCLGLBuffer();
+  virtual ~OCLGLBuffer();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Number of cl_uint4 elements in each buffer (also the global work size).
+  static const unsigned int c_numOfElements = 1024;
+  // GL buffer names for the kernel input and the GL-side output; 0 when no
+  // buffer is allocated.
+  GLuint inGLBuffer_;
+  GLuint outGLBuffer_;
+};
+
+#endif  // _OCL_GL_BUFFER_H_
@@ -0,0 +1,303 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLBufferMultipleQueues.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+// OpenCL C source of the test kernel. For work-item `tid` it writes
+// source[tid] + 2 (component-wise on uint4) into glDest and source[tid] + 1
+// into clDest.
+const static char* strKernel =
+    "__kernel void glbuffer_test( __global uint4 *source, __global uint4 "
+    "*glDest, __global uint4 *clDest)   \n"
+    "{                                                                         "
+    "                             \n"
+    "    int  tid = get_global_id(0);                                          "
+    "                             \n"
+    "    glDest[ tid ] = source[ tid ] + (uint4)(2);                           "
+    "                             \n"
+    "    clDest[ tid ] = source[ tid ] + (uint4)(1);                           "
+    "                             \n"
+    "}                                                                         "
+    "                             \n";
+
+// Declares that this test case consists of exactly one sub-test.
+OCLGLBufferMultipleQueues::OCLGLBufferMultipleQueues() {
+  _numSubTests = 1;
+}
+
+// Intentionally empty: the CL/GL objects of this test are released in close().
+OCLGLBufferMultipleQueues::~OCLGLBufferMultipleQueues() {
+}
+
+// Prepares the test: seeds rand(), opens the common CL/GL state, creates the
+// additional command queues for the selected device and builds the kernel.
+//
+// test, units and conversion are forwarded unchanged to OCLGLCommon::open().
+// deviceId is forwarded as well and also selects the device used for the
+// extra queues and for the program build.
+void OCLGLBufferMultipleQueues::open(unsigned int test, char* units,
+                                     double& conversion,
+                                     unsigned int deviceId) {
+  // Initialize random number seed (run() fills its input data with rand())
+  srand((unsigned int)time(NULL));
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Create multiple queues for the device (first add already created queue in
+  // OCLGLCommon::open, then add the remaining queues). resize() leaves the
+  // not-yet-created slots as NULL handles.
+  deviceCmdQueues_.resize(QUEUES_PER_DEVICE_COUNT);
+  deviceCmdQueues_[0] = cmdQueues_[deviceId];
+  for (int queueIndex = 1; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    deviceCmdQueues_[queueIndex] = cmdQueue;
+  }
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled so the printf below stays well defined even when the log
+    // query itself fails and leaves the buffer untouched.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Runs the test body in three passes over all queues:
+//   1. create a GL input/output buffer pair plus a plain CL output buffer and
+//      wrap the GL buffers as CL memory objects,
+//   2. enqueue acquire -> kernel -> release on every queue and flush it,
+//   3. read both outputs back and compare them with input + 1 (CL buffer) and
+//      input + 2 (GL buffer).
+// Any failure is reported through CHECK_RESULT.
+void OCLGLBufferMultipleQueues::run(void) {
+  if (_errorFlag) {
+    return;
+  }
+
+  inputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
+  outputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
+  outputCLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
+
+  std::vector<std::vector<cl_uint4> > inData(
+      QUEUES_PER_DEVICE_COUNT);  // Input data per queue
+
+  inGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0);
+  outGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0);
+  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    // Initialize input data with random values
+    inData[queueIndex].resize(BUFFER_ELEMENTS_COUNT);
+    for (int i = 0; i < BUFFER_ELEMENTS_COUNT; i++) {
+      for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+        inData[queueIndex][i].s[j] = (unsigned int)rand();
+      }
+    }
+
+    // Generate and Bind in & out OpenGL buffers
+    glGenBuffers(1, &inGLBufferIDs_[queueIndex]);
+    glGenBuffers(1, &outGLBufferIDs_[queueIndex]);
+
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBufferIDs_[queueIndex]);
+    glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
+                 &inData[queueIndex][0], GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]);
+    glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
+                 NULL, GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glFinish();
+
+    // Create input buffer from GL input buffer
+    inputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer(
+        context_, CL_MEM_READ_ONLY, inGLBufferIDs_[queueIndex], &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create input GL buffer (%d)", error_);
+
+    // Create output buffer from GL output buffer
+    outputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer(
+        context_, CL_MEM_WRITE_ONLY, outGLBufferIDs_[queueIndex], &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create output GL buffer (%d)", error_);
+
+    // Create a CL output buffer
+    outputCLBufferPerQueue_[queueIndex] = _wrapper->clCreateBuffer(
+        context_, CL_MEM_WRITE_ONLY, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
+        NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)",
+                 error_);
+  }
+
+  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    // Assign arguments to kernel according to queue index
+    error_ = _wrapper->clSetKernelArg(
+        kernel_, 0, sizeof(cl_mem),
+        &inputGLBufferPerQueue_[queueIndex]);  // Input source
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+    error_ = _wrapper->clSetKernelArg(
+        kernel_, 1, sizeof(cl_mem),
+        &outputGLBufferPerQueue_[queueIndex]);  // Output glDest
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+    error_ = _wrapper->clSetKernelArg(
+        kernel_, 2, sizeof(cl_mem),
+        &outputCLBufferPerQueue_[queueIndex]);  // Output clDest
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    // Acquire input GL buffer
+    error_ = _wrapper->clEnqueueAcquireGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0,
+        NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    // Acquire output GL buffer
+    error_ = _wrapper->clEnqueueAcquireGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex],
+        0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    // Enqueue the kernel
+    size_t gws[1] = {BUFFER_ELEMENTS_COUNT};
+    error_ =
+        _wrapper->clEnqueueNDRangeKernel(deviceCmdQueues_[queueIndex], kernel_,
+                                         1, NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+                 error_);
+
+    // Release input GL buffer
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0,
+        NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    // Release output GL buffer
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex],
+        0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    // Flush commands in order to trigger the operations
+    error_ = _wrapper->clFlush(deviceCmdQueues_[queueIndex]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clFlush() failed (%d)", error_);
+  }
+
+  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    // Get the results from CL buffer (in a synchronous manner)
+    cl_uint4 outDataCL[BUFFER_ELEMENTS_COUNT];
+    error_ = _wrapper->clEnqueueReadBuffer(
+        deviceCmdQueues_[queueIndex], outputCLBufferPerQueue_[queueIndex],
+        CL_TRUE, 0, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), outDataCL, 0,
+        NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! (%d)",
+                 error_);
+
+    cl_uint4 outDataGL[BUFFER_ELEMENTS_COUNT] = {{{0}}};
+    // The buffer was unbound right after creation, so bind it again before
+    // mapping it for the read-back.
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]);
+    void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    // glMapBuffer reports failure with a NULL pointer; never copy from it.
+    if (glMem != NULL) {
+      memcpy(outDataGL, glMem, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4));
+      glUnmapBuffer(GL_ARRAY_BUFFER);
+    }
+    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed for GL output buffer");
+
+    cl_uint4 expectedCL = {{0}};
+    cl_uint4 expectedGL = {{0}};
+
+    // Check output
+    for (int i = 0; i < BUFFER_ELEMENTS_COUNT; ++i) {
+      // Calculate expected value in CL output buffer (input + 1)
+      expectedCL = inData[queueIndex][i];
+      expectedCL.s[0]++;
+      expectedCL.s[1]++;
+      expectedCL.s[2]++;
+      expectedCL.s[3]++;
+
+      // Calculate expected value in GL output buffer (input + 2)
+      expectedGL = inData[queueIndex][i];
+      expectedGL.s[0] += 2;
+      expectedGL.s[1] += 2;
+      expectedGL.s[2] += 2;
+      expectedGL.s[3] += 2;
+
+      // Compare expected output with actual data received
+      for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+        CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]),
+                     "Element %d in CL output buffer is incorrect!\n\t \
+							 expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
+                     i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2],
+                     expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1],
+                     outDataCL[i].s[2], outDataCL[i].s[3]);
+        CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]),
+                     "Element %d in GL output buffer is incorrect!\n\t \
+							 expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
+                     i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2],
+                     expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1],
+                     outDataGL[i].s[2], outDataGL[i].s[3]);
+      }
+    }
+  }
+}
+
+// Releases everything open() and run() created, then hands over to
+// OCLGLCommon::close() and returns its result.
+//
+// Slots that were never filled (run() bailed out early) still hold NULL/0 and
+// are skipped. Every container is emptied afterwards so that no stale handle
+// survives into a later open()/run()/close() cycle on the same object.
+unsigned int OCLGLBufferMultipleQueues::close(void) {
+  // Release cl buffers (must be done before releasing the associated GL
+  // buffers)
+  for (size_t i = 0; i < inputGLBufferPerQueue_.size(); ++i) {
+    if (inputGLBufferPerQueue_[i] == NULL) continue;
+    error_ = _wrapper->clReleaseMemObject(inputGLBufferPerQueue_[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseMemObject() failed");
+  }
+  inputGLBufferPerQueue_.clear();
+
+  for (size_t i = 0; i < outputGLBufferPerQueue_.size(); ++i) {
+    if (outputGLBufferPerQueue_[i] == NULL) continue;
+    error_ = _wrapper->clReleaseMemObject(outputGLBufferPerQueue_[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseMemObject() failed");
+  }
+  outputGLBufferPerQueue_.clear();
+
+  for (size_t i = 0; i < outputCLBufferPerQueue_.size(); ++i) {
+    if (outputCLBufferPerQueue_[i] == NULL) continue;
+    error_ = _wrapper->clReleaseMemObject(outputCLBufferPerQueue_[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseMemObject() failed");
+  }
+  outputCLBufferPerQueue_.clear();
+
+  // Delete GL in & out buffers
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+  if (!inGLBufferIDs_.empty()) {
+    glDeleteBuffers((int)inGLBufferIDs_.size(), &inGLBufferIDs_[0]);
+    inGLBufferIDs_.clear();
+  }
+
+  if (!outGLBufferIDs_.empty()) {
+    glDeleteBuffers((int)outGLBufferIDs_.size(), &outGLBufferIDs_[0]);
+    outGLBufferIDs_.clear();
+  }
+
+  // Release queues created by open method, the first queue per device is
+  // released by base class
+  for (size_t queueIndex = 1; queueIndex < deviceCmdQueues_.size();
+       ++queueIndex) {
+    if (deviceCmdQueues_[queueIndex] == NULL) continue;
+    error_ = _wrapper->clReleaseCommandQueue(deviceCmdQueues_[queueIndex]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+  deviceCmdQueues_.clear();
+
+  return OCLGLCommon::close();
+}
@@ -0,0 +1,48 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
+#define _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL buffer interop test that drives several command queues of one device.
+// Each queue gets its own GL input buffer, GL output buffer and CL output
+// buffer; the containers below hold one entry per queue.
+class OCLGLBufferMultipleQueues : public OCLGLCommon {
+ public:
+  OCLGLBufferMultipleQueues();
+  virtual ~OCLGLBufferMultipleQueues();
+
+  // open() creates the extra queues and builds the kernel, run() executes and
+  // verifies the test, close() releases what the other two created.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Number of cl_uint4 elements in every buffer.
+  static const int BUFFER_ELEMENTS_COUNT = 1024;
+  // Number of command queues used on the tested device.
+  static const int QUEUES_PER_DEVICE_COUNT = 2;
+  std::vector<cl_command_queue>
+      deviceCmdQueues_;  // Multiple queues per device (single device)
+  std::vector<cl_mem> inputGLBufferPerQueue_;   // Input GL buffer per queue
+  std::vector<cl_mem> outputGLBufferPerQueue_;  // Output GL buffer per queue
+  std::vector<cl_mem> outputCLBufferPerQueue_;  // Output CL buffer per queue
+  std::vector<GLuint> inGLBufferIDs_;           // Input GL buffers IDs
+  std::vector<GLuint> outGLBufferIDs_;          // Output GL buffers IDs
+};
+
+#endif  // _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
@@ -0,0 +1,270 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLDepthBuffer.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+// OpenCL C source of the depth test kernel. Each work-item reads the texel at
+// (get_global_id(0), get_global_id(1)) from `source` through `sampler` and
+// stores its .z component into `output` at the row-major index
+// y * image_width + x. The cl_amd_printf pragma is enabled although the
+// kernel body does not call printf.
+const static char* strKernel =
+    "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+    "__kernel void gldepths_test( __global float *output, read_only  image2d_t "
+    "source, sampler_t sampler){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n"
+    "    output[ tidY * get_image_width( source ) + tidX ] =  value.z;\n"
+    "}\n";
+
+// Starts with every GL name, CL handle and host pointer zeroed and the
+// extension marked as unsupported; announces two sub-tests and selects
+// sub-test 0.
+OCLGLDepthBuffer::OCLGLDepthBuffer()
+    : glDepthBuffer_(0)
+    , frameBufferOBJ_(0)
+    , colorBuffer_(0)
+    , clOutputBuffer_(0)
+    , clDepth_(0)
+    , clSampler_(0)
+    , pGLOutput_(0)
+    , pCLOutput_(0)
+    , extensionSupported_(false)
+{
+  _currentTest = 0;
+  _numSubTests = 2;
+}
+
+// Intentionally empty: buffers and CL/GL objects are released in close().
+OCLGLDepthBuffer::~OCLGLDepthBuffer() {
+}
+
+// Prepares the depth-buffer test: opens the common CL/GL state, checks that
+// the device advertises cl_khr_gl_depth_images and builds the test kernel.
+//
+// test, units, conversion and deviceId are forwarded to OCLGLCommon::open();
+// `test` is additionally stored as the sub-test that run() dispatches on.
+// When the extension is missing the function prints a notice and returns with
+// extensionSupported_ still false, which makes run() a no-op.
+void OCLGLDepthBuffer::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Ask for the required size first so that long extension strings cannot
+  // overflow (or be rejected for) a fixed-size buffer.
+  size_t extensionsSize = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                                     0, NULL, &extensionsSize);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)", error_);
+
+  char* pExtensions = (char*)malloc(extensionsSize + 1);
+  CHECK_RESULT((pExtensions == NULL),
+               "failed to allocate the extension string buffer");
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                                     extensionsSize, pExtensions, NULL);
+  bool depthInteropSupported = false;
+  if (error_ == CL_SUCCESS) {
+    // Guarantee termination before handing the buffer to strstr().
+    pExtensions[extensionsSize] = '\0';
+    depthInteropSupported =
+        (strstr(pExtensions, "cl_khr_gl_depth_images") != NULL);
+  }
+  // The buffer is released before any early exit below.
+  free(pExtensions);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)", error_);
+
+  // if the extension is not supported
+  if (!depthInteropSupported) {
+    printf("skipping test depth interop not supported\n");
+    return;
+  }
+  extensionSupported_ = true;
+
+  _currentTest = test;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled so the printf below stays well defined even when the log
+    // query itself fails and leaves the buffer untouched.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Dispatches to testDepthRead() with the depth format and attachment point
+// that belong to the current sub-test, and reports a failure through
+// CHECK_RESULT. Does nothing when open() failed or the depth interop
+// extension is unavailable.
+void OCLGLDepthBuffer::run(void) {
+  if (_errorFlag || !extensionSupported_) {
+    return;
+  }
+  // Initialised so that no path can reach the final check with an
+  // indeterminate value (the default branch assigns nothing).
+  bool retVal = false;
+  switch (_currentTest) {
+    case 0:
+      retVal = testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_ATTACHMENT);
+      break;
+    case 1:
+      retVal = testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_ATTACHMENT);
+      break;
+    case 2:
+      retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT);
+      break;
+    case 3:
+      retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT);
+      break;
+    default:
+      CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT((retVal != true), "cl-gl depth test failed ");
+}
+
+// Prints "<callName> failed (<status>)" when status is not CL_SUCCESS and
+// tells the caller whether the call succeeded.
+static bool depthBufferClCallOk(cl_int status, const char* callName) {
+  if (status == CL_SUCCESS) {
+    return true;
+  }
+  printf("%s failed (%d)\n", callName, status);
+  return false;
+}
+
+// Renders a tilted quad into a framebuffer whose depth attachment is a
+// renderbuffer of `internalFormat`, reads that renderbuffer through OpenCL
+// with the test kernel and compares the result byte-for-byte with what
+// glReadPixels() returns for the depth component.
+//
+// internalFormat  GL internal format of the depth renderbuffer.
+// attachmentType  framebuffer attachment point the renderbuffer is bound to.
+// Returns true when both read-backs are identical; false on any GL/CL failure
+// or mismatch (on mismatch both buffers are dumped to CSV files).
+// All objects created here are stored in members and released by close().
+bool OCLGLDepthBuffer::testDepthRead(GLint internalFormat,
+                                     GLenum attachmentType) {
+  cl_int error;
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+
+  // One 4-byte float per pixel.
+  unsigned int bufferSize = c_dimSize * c_dimSize * 4;
+  bool retVal = false;
+
+  pGLOutput_ = (float*)malloc(bufferSize);
+  pCLOutput_ = (float*)malloc(bufferSize);
+  if (pGLOutput_ == NULL || pCLOutput_ == NULL) {
+    printf("failed to allocate host result buffers\n");
+    return false;
+  }
+  // create Frame buffer object
+  glGenFramebuffers(1, &frameBufferOBJ_);
+
+  // create   textures
+  glGenTextures(1, &colorBuffer_);
+  glEnable(GL_TEXTURE_2D);
+  glBindTexture(GL_TEXTURE_2D, colorBuffer_);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA,
+               GL_UNSIGNED_BYTE, 0);
+  glBindTexture(GL_TEXTURE_2D, 0);
+  // create a renderbuffer for the depth/stencil buffer
+  glGenRenderbuffers(1, &glDepthBuffer_);
+  glBindRenderbuffer(GL_RENDERBUFFER, glDepthBuffer_);
+  glRenderbufferStorage(GL_RENDERBUFFER, internalFormat, c_dimSize, c_dimSize);
+
+  // attach the color texture and the depth renderbuffer to the framebuffer
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0);
+  glFramebufferRenderbuffer(GL_FRAMEBUFFER, attachmentType, GL_RENDERBUFFER,
+                            glDepthBuffer_);
+
+  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  if (GL_FRAMEBUFFER_COMPLETE != status) {
+    return false;
+  }
+  // set up gl state machine
+  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
+  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
+  glLoadIdentity();                        // Reset The Projection Matrix
+  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
+  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
+  glLoadIdentity();
+  glEnable(GL_DEPTH_TEST);
+  glClear(GL_COLOR_BUFFER_BIT |
+          GL_DEPTH_BUFFER_BIT);     // Clear Screen And Depth Buffer
+  glBegin(GL_QUADS);                // Draw A Quad
+  glVertex3f(-1.0f, 1.0f, -6.0f);   // Top Left
+  glVertex3f(1.0f, 1.0f, -6.0f);    // Top Right
+  glVertex3f(1.0f, -1.0f, -3.0f);   // Bottom Right
+  glVertex3f(-1.0f, -1.0f, -3.0f);  // Bottom Left
+  glEnd();
+
+  // Make sure rendering is complete before OpenCL touches the renderbuffer.
+  glFinish();
+
+  clDepth_ = _wrapper->clCreateFromGLRenderbuffer(context_, CL_MEM_READ_WRITE,
+                                                  glDepthBuffer_, &error);
+  if (CL_SUCCESS != error) {
+    printf("clCreateFromGLRenderbuffer failed\n");
+    return false;
+  }
+
+  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                             bufferSize, NULL, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE,
+                                         CL_FILTER_NEAREST, &error);
+  if (CL_SUCCESS != error) return false;
+
+  error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
+                                              &clDepth_, 0, NULL, NULL);
+  if (!depthBufferClCallOk(error, "clEnqueueAcquireGLObjects")) {
+    return false;
+  }
+
+  // Set the arguments and launch the kernel; && stops at the first failure.
+  const bool kernelOk =
+      depthBufferClCallOk(_wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                                   &clOutputBuffer_),
+                          "clSetKernelArg") &&
+      depthBufferClCallOk(
+          _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_),
+          "clSetKernelArg") &&
+      depthBufferClCallOk(_wrapper->clSetKernelArg(kernel_, 2,
+                                                   sizeof(cl_sampler),
+                                                   &clSampler_),
+                          "clSetKernelArg") &&
+      depthBufferClCallOk(
+          _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                           NULL, dimSizes, NULL, 0, NULL, NULL),
+          "clEnqueueNDRangeKernel");
+
+  // The GL object was acquired above, so hand it back even if the kernel
+  // could not be launched.
+  const bool releaseOk = depthBufferClCallOk(
+      _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clDepth_,
+                                          0, NULL, NULL),
+      "clEnqueueReleaseGLObjects");
+  if (!kernelOk || !releaseOk) {
+    return false;
+  }
+
+  // Blocking read of the kernel output.
+  error = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_,
+                                        CL_TRUE, 0, bufferSize, pCLOutput_, 0,
+                                        NULL, NULL);
+  if (!depthBufferClCallOk(error, "clEnqueueReadBuffer")) {
+    return false;
+  }
+
+  glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT,
+               pGLOutput_);
+
+  // test that both resources are identical.
+  if (0 == memcmp(pGLOutput_, pCLOutput_, bufferSize)) {
+    retVal = true;  // test successful
+  } else {
+    printf("expected results is different from actual results\n");
+    dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize);
+    dumpBuffer(pCLOutput_, "CLDepth.csv", c_dimSize);
+  }
+
+  return retVal;
+}
+
+// Frees the host buffers and releases the CL and GL objects created by
+// testDepthRead(), then returns the result of OCLGLCommon::close().
+//
+// Handles that were never created are still 0 and are skipped, so calling
+// this after a skipped or failed test is safe. Every member is reset so a
+// second call cannot release anything twice.
+unsigned int OCLGLDepthBuffer::close(void) {
+  if (pGLOutput_) {
+    free(pGLOutput_);
+    pGLOutput_ = NULL;
+  }
+
+  if (pCLOutput_) {
+    free(pCLOutput_);
+    pCLOutput_ = NULL;
+  }
+
+  if (clDepth_) {
+    clReleaseMemObject(clDepth_);
+    clDepth_ = 0;
+  }
+  if (clOutputBuffer_) {
+    clReleaseMemObject(clOutputBuffer_);
+    clOutputBuffer_ = 0;
+  }
+  if (clSampler_) {
+    clReleaseSampler(clSampler_);
+    clSampler_ = 0;
+  }
+
+  // unbind the texture and frame buffer. Only meaningful when the test
+  // created (and left bound) its own framebuffer object.
+  if (frameBufferOBJ_ != 0) {
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
+  }
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  // clean gl resources
+  glDeleteFramebuffers(1, &frameBufferOBJ_);
+  frameBufferOBJ_ = 0;
+  glDeleteTextures(1, &colorBuffer_);
+  colorBuffer_ = 0;
+  // glDepthBuffer_ comes from glGenRenderbuffers(), so it has to be deleted
+  // as a renderbuffer; texture names live in a separate namespace.
+  glDeleteRenderbuffers(1, &glDepthBuffer_);
+  glDepthBuffer_ = 0;
+
+  return OCLGLCommon::close();
+}
+
+// helper functions
+// Maps a GL depth / depth-stencil internal format to its element size in
+// bytes. Formats that are not listed yield 0.
+unsigned int OCLGLDepthBuffer::formatToSize(GLint internalFormat) {
+  unsigned int bytesPerElement = 0;
+  switch (internalFormat) {
+    case GL_DEPTH_COMPONENT16:
+      bytesPerElement = 2;
+      break;
+    case GL_DEPTH_COMPONENT32F:
+    case GL_DEPTH24_STENCIL8:
+      bytesPerElement = 4;
+      break;
+    case GL_DEPTH32F_STENCIL8:
+      bytesPerElement = 8;
+      break;
+    default:
+      break;
+  }
+  return bytesPerElement;
+}
@@ -0,0 +1,66 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_DEPTH_BUFFER_H_
+#define _OCL_GL_DEPTH_BUFFER_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL interop test that renders into a depth renderbuffer, reads it back
+// through OpenCL and compares the result with the OpenGL read-back.
+class OCLGLDepthBuffer : public OCLGLCommon {
+ public:
+  OCLGLDepthBuffer();
+  virtual ~OCLGLDepthBuffer();
+  // Width and height, in pixels, of the square render target.
+  static const unsigned int c_dimSize = 128;
+  // open() checks for cl_khr_gl_depth_images and builds the kernel, run()
+  // dispatches the current sub-test, close() releases all resources.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  ////////////////////
+  // test functions //
+  ////////////////////
+  // Runs one depth read-back comparison; returns true when CL and GL agree.
+  bool testDepthRead(GLint internalFormat, GLenum attachmentType);
+  // Sub-test index that run() switches on (set by open()).
+  unsigned int _currentTest;
+  /////////////////////
+  // private members //
+  /////////////////////
+  // GL resource identifiers
+  GLuint glDepthBuffer_;   // depth renderbuffer name
+  GLuint frameBufferOBJ_;  // framebuffer object name
+  GLuint colorBuffer_;     // color texture name
+
+  // CL identifiers
+  cl_mem clOutputBuffer_;  // kernel output buffer
+  cl_mem clDepth_;         // CL view of the GL depth renderbuffer
+  cl_sampler clSampler_;
+
+  // pointers to buffers
+  float* pGLOutput_;  // host copy of the depth values read through OpenGL
+  float* pCLOutput_;  // host copy of the depth values read through OpenCL
+  // True once open() has found cl_khr_gl_depth_images on the device.
+  bool extensionSupported_;
+  //////////////////////////////
+  // private helper functions //
+  //////////////////////////////
+  // returns element size in bytes.
+  static unsigned int formatToSize(GLint internalFormat);
+};
+
+#endif  // _OCL_GL_DEPTH_BUFFER_H_
@@ -0,0 +1,278 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLDepthTex.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+// OpenCL C source of the `gldepths_test` kernel. Each work-item samples the
+// image at its 2D global id with read_imagef and stores the .z component of
+// the result into `output` at the row-major index tidY * image_width + tidX.
+const static char* strKernel =
+    "__kernel void gldepths_test( __global float *output, read_only image2d_t "
+    "source, sampler_t sampler){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n"
+    "    output[ tidY * get_image_width( source ) + tidX ] =  value.z;\n"
+    "}\n";
+
+// Builds the test object with every GL name, CL handle and host buffer
+// cleared, and the depth-interop extension marked as not (yet) detected.
+// Eight sub-tests are advertised to the framework.
+OCLGLDepthTex::OCLGLDepthTex()
+    : glDepthBuffer_(0u),
+      frameBufferOBJ_(0u),
+      colorBuffer_(0u),
+      clOutputBuffer_(NULL),
+      clDepth_(NULL),
+      clSampler_(NULL),
+      pGLOutput_(NULL),
+      pCLOutput_(NULL),
+      extensionSupported_(false) {
+  _currentTest = 0;
+  _numSubTests = 8;
+}
+
+// Nothing to do here: resources are torn down in close().
+OCLGLDepthTex::~OCLGLDepthTex() {}
+
+// Prepares one sub-test.
+//
+// After the common CL/GL setup succeeds, the device extension string is
+// searched for cl_khr_gl_depth_images. When it is absent the test is skipped
+// (extensionSupported_ stays false and run() becomes a no-op). Otherwise the
+// gldepths_test kernel is built: sub-tests 0-3 use default build options and
+// sub-tests 4-7 add -cl-std=CL2.0; `test % 4` selects the depth format that
+// run() exercises.
+//
+// test       - sub-test index.
+// units      - forwarded to OCLGLCommon::open().
+// conversion - forwarded to OCLGLCommon::open().
+// deviceId   - index into devices_ of the device under test.
+void OCLGLDepthTex::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  extensionSupported_ = false;
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Ask for the size of the extension string first instead of assuming it
+  // fits a fixed buffer; a failed or truncated query must never leave the
+  // search below reading uninitialized memory.
+  size_t extensionsSize = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                                     0, NULL, &extensionsSize);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)",
+               error_);
+
+  bool depthInteropAvailable = false;
+  if (extensionsSize > 0) {
+    // One spare zeroed byte keeps the buffer terminated no matter what the
+    // runtime writes.
+    char* pExtensions = (char*)calloc(extensionsSize + 1, 1);
+    CHECK_RESULT((pExtensions == NULL),
+                 "unable to allocate the device extension string");
+
+    error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                       CL_DEVICE_EXTENSIONS, extensionsSize,
+                                       pExtensions, NULL);
+    if (error_ == CL_SUCCESS) {
+      depthInteropAvailable =
+          (strstr(pExtensions, "cl_khr_gl_depth_images") != NULL);
+    }
+    // Free before CHECK_RESULT so an error exit cannot leak the buffer.
+    free(pExtensions);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)",
+                 error_);
+  }
+
+  // If the extension is not supported, skip the test.
+  if (!depthInteropAvailable) {
+    printf("skipping test depth interop not supported\n");
+    return;
+  }
+  extensionSupported_ = true;
+
+  static const char* OpenCL20Kernel = "-cl-std=CL2.0";
+  const char* options = OpenCL20Kernel;
+  if (test < 4) {
+    options = NULL;
+  }
+  _currentTest = test % 4;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-initialized so that a failing log query prints an empty string
+    // rather than stack garbage.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Executes the depth-read scenario selected by open().
+//
+// _currentTest maps to the GL depth format under test:
+//   0 - GL_DEPTH24_STENCIL8
+//   1 - GL_DEPTH_COMPONENT16
+//   2 - GL_DEPTH_COMPONENT32F
+//   3 - GL_DEPTH32F_STENCIL8
+// Does nothing when an earlier step failed or the interop extension is
+// missing.
+void OCLGLDepthTex::run(void) {
+  if (_errorFlag || !extensionSupported_) {
+    return;
+  }
+  // Starts as "failed" so the result is well defined on every path, including
+  // the unsupported-test-number branch.
+  bool retVal = false;
+  switch (_currentTest) {
+    case 0:
+      retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL,
+                             GL_UNSIGNED_INT_24_8);
+      break;
+    case 1:
+      retVal =
+          testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_FLOAT);
+      break;
+    case 2:
+      retVal =
+          testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT);
+      break;
+    case 3:
+      retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL,
+                             GL_FLOAT_32_UNSIGNED_INT_24_8_REV);
+      break;
+    default:
+      CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT((retVal != true), "cl-gl depth test failed ");
+}
+
+// Renders into a depth texture with OpenGL and checks that OpenCL reads back
+// the same depth values.
+//
+// A framebuffer object is created with an RGBA colour texture and a depth
+// texture of the requested format. Three differently tilted quads are drawn;
+// after each draw the depth texture is acquired by OpenCL, copied into a
+// buffer by the gldepths_test kernel, and compared byte-for-byte with the
+// result of glReadPixels(GL_DEPTH_COMPONENT, GL_FLOAT).
+//
+// internalFormat - GL internal format of the depth texture.
+// format         - GL pixel format; GL_DEPTH_COMPONENT selects the depth
+//                  attachment, anything else the depth-stencil attachment.
+// type           - GL pixel type passed to glTexImage2D.
+//
+// Returns true only if every one of the three passes matched and no GL/CL
+// call failed. Resources created here are released by close().
+bool OCLGLDepthTex::testDepthRead(GLint internalFormat, GLenum format,
+                                  GLenum type) {
+  const unsigned int bufferSize = c_dimSize * c_dimSize * 4;
+
+  pGLOutput_ = (float*)malloc(bufferSize);
+  pCLOutput_ = (float*)malloc(bufferSize);
+  if (pGLOutput_ == NULL || pCLOutput_ == NULL) {
+    printf("unable to allocate host comparison buffers\n");
+    return false;
+  }
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+
+  // create Frame buffer object
+  glGenFramebuffers(1, &frameBufferOBJ_);
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+
+  // create   textures
+  glGenTextures(1, &colorBuffer_);
+  glBindTexture(GL_TEXTURE_2D, colorBuffer_);
+
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA,
+               GL_UNSIGNED_BYTE, 0);
+
+  glGenTextures(1, &glDepthBuffer_);
+  glBindTexture(GL_TEXTURE_2D, glDepthBuffer_);
+  glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, c_dimSize, c_dimSize, 0,
+               format, type, 0);
+  // Drain the GL error flag and surface it: a rejected depth format is the
+  // usual reason for the completeness check below to fail.
+  const GLenum glError = glGetError();
+  if (glError != GL_NO_ERROR) {
+    printf("GL error 0x%x pending after texture creation\n",
+           (unsigned int)glError);
+  }
+
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0);
+
+  if (GL_DEPTH_COMPONENT == format) {
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, glDepthBuffer_,
+                         0);
+  } else {
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
+                         glDepthBuffer_, 0);
+  }
+
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+
+  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  if (GL_FRAMEBUFFER_COMPLETE != status) {
+    printf("frame buffer incomplete!\n");
+    return false;
+  }
+  // set up gl state machine
+  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
+  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
+  glLoadIdentity();                        // Reset The Projection Matrix
+  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
+  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
+  glLoadIdentity();
+  glEnable(GL_DEPTH_TEST);
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+
+  cl_int error;
+
+  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                             bufferSize, NULL, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE,
+                                         CL_FILTER_NEAREST, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clDepth_ = _wrapper->clCreateFromGLTexture(
+      context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, glDepthBuffer_, &error);
+  if (CL_SUCCESS != error) return false;
+
+  // Depth of the top edge / bottom edge of the quad drawn in each pass.
+  static const float zValues[3][2] = {
+      {-6.f, -3.f},
+      {-5.f, -2.f},
+      {-4.f, -1.f},
+  };
+
+  // Every pass has to match; a single mismatch fails the whole test.
+  bool allPassesMatch = true;
+  for (int i = 0; i < 3; ++i) {
+    glClear(GL_COLOR_BUFFER_BIT |
+            GL_DEPTH_BUFFER_BIT);  // Clear Screen And Depth Buffer
+
+    glBegin(GL_QUADS);                        // Draw A Quad
+    glVertex3f(-1.0f, 1.0f, zValues[i][0]);   // Top Left
+    glVertex3f(1.0f, 1.0f, zValues[i][0]);    // Top Right
+    glVertex3f(1.0f, -1.0f, zValues[i][1]);   // Bottom Right
+    glVertex3f(-1.0f, -1.0f, zValues[i][1]);  // Bottom Left
+    glEnd();
+
+    // GL must be done with the texture before CL acquires it.
+    glFinish();
+
+    error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
+                                                &clDepth_, 0, NULL, NULL);
+    if (CL_SUCCESS != error) {
+      printf("clEnqueueAcquireGLObjects() failed (%d)\n", error);
+      return false;
+    }
+
+    error =
+        _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
+    if (CL_SUCCESS == error) {
+      error = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_);
+    }
+    if (CL_SUCCESS == error) {
+      error =
+          _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_);
+    }
+    if (CL_SUCCESS == error) {
+      error = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_,
+                                               2, NULL, dimSizes, NULL, 0,
+                                               NULL, NULL);
+    }
+
+    // The texture was acquired above, so always hand it back to GL even if
+    // the kernel could not be launched.
+    const cl_int releaseError = _wrapper->clEnqueueReleaseGLObjects(
+        cmdQueues_[_deviceId], 1, &clDepth_, 0, NULL, NULL);
+    if (CL_SUCCESS == error) {
+      error = releaseError;
+    }
+
+    if (CL_SUCCESS == error) {
+      error = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId],
+                                            clOutputBuffer_, CL_TRUE, 0,
+                                            bufferSize, pCLOutput_, 0, NULL,
+                                            NULL);
+    }
+    if (CL_SUCCESS != error) {
+      printf("OpenCL depth read-back failed (%d)\n", error);
+      return false;
+    }
+
+    glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT,
+                 pGLOutput_);
+
+    // test that both resources are identical.
+    if (0 != memcmp(pGLOutput_, pCLOutput_, bufferSize)) {
+      allPassesMatch = false;
+      printf("expected results is different from actual results\n");
+      dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize);
+      dumpBuffer(pCLOutput_, "clDepth_.csv", c_dimSize);
+    }
+  }
+
+  return allPassesMatch;
+}
+
+// Releases everything created by testDepthRead() and then runs the common
+// teardown.
+//
+// Each resource is released only if it was actually created, and its handle
+// is cleared afterwards. This makes the skipped-test path (no extension, so
+// nothing was created) and a repeated close() harmless.
+//
+// Returns the value of OCLGLCommon::close().
+unsigned int OCLGLDepthTex::close(void) {
+  // free(NULL) is a no-op, so no guard is needed for the host buffers.
+  free(pGLOutput_);
+  pGLOutput_ = NULL;
+  free(pCLOutput_);
+  pCLOutput_ = NULL;
+
+  if (clDepth_ != NULL) {
+    clReleaseMemObject(clDepth_);
+    clDepth_ = NULL;
+  }
+  if (clOutputBuffer_ != NULL) {
+    clReleaseMemObject(clOutputBuffer_);
+    clOutputBuffer_ = NULL;
+  }
+  if (clSampler_ != NULL) {
+    clReleaseSampler(clSampler_);
+    clSampler_ = NULL;
+  }
+
+  if (frameBufferOBJ_ != 0) {
+    // unbind the texture and frame buffer.
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
+    glBindFramebuffer(GL_FRAMEBUFFER, 0);
+    // clean gl resources
+    glDeleteFramebuffers(1, &frameBufferOBJ_);
+    frameBufferOBJ_ = 0;
+  }
+  if (colorBuffer_ != 0) {
+    glDeleteTextures(1, &colorBuffer_);
+    colorBuffer_ = 0;
+  }
+  if (glDepthBuffer_ != 0) {
+    glDeleteTextures(1, &glDepthBuffer_);
+    glDepthBuffer_ = 0;
+  }
+
+  return OCLGLCommon::close();
+}
@@ -0,0 +1,62 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_DEPTH_TEX_H_
+#define _OCL_GL_DEPTH_TEX_H_
+
+#include "OCLGLCommon.h"
+
+// Test that renders into an OpenGL depth texture and verifies that OpenCL
+// reads the same depth values through GL interop.
+class OCLGLDepthTex : public OCLGLCommon {
+ public:
+  OCLGLDepthTex();
+  virtual ~OCLGLDepthTex();
+  // Width and height, in pixels, of the textures and viewport used.
+  static const unsigned int c_dimSize = 128;
+  // Checks for the depth-interop extension and builds the kernel.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  // Runs the depth format selected by open().
+  virtual void run(void);
+  // Releases the GL/CL resources and host buffers.
+  virtual unsigned int close(void);
+
+ private:
+  ////////////////////
+  // test functions //
+  ////////////////////
+  // Draws into a depth texture of the given format/type and compares the
+  // OpenCL read-back with glReadPixels; returns the pass/fail result.
+  bool testDepthRead(GLint internalFormat, GLenum format, GLenum type);
+  // Depth format selector (sub-test index modulo 4), set in open().
+  unsigned int _currentTest;
+
+  /////////////////////
+  // private members //
+  /////////////////////
+  // GL resource identifiers
+  GLuint glDepthBuffer_;
+  GLuint frameBufferOBJ_;
+  GLuint colorBuffer_;
+
+  // CL identifiers
+  cl_mem clOutputBuffer_;
+  cl_mem clDepth_;
+  cl_sampler clSampler_;
+
+  // pointers to buffers
+  float* pGLOutput_;  // depth values read back with glReadPixels
+  float* pCLOutput_;  // depth values read back from the OpenCL buffer
+  // True once open() has found cl_khr_gl_depth_images on the device.
+  bool extensionSupported_;
+};
+
+#endif  // _OCL_GL_DEPTH_TEX_H_
@@ -0,0 +1,481 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLFenceSync.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "Timer.h"
+#ifndef WIN_OS
+#include <GL/glx.h>
+#endif
+
+// OpenCL C source of the `glmulticontext_test` kernel: every work-item copies
+// one uint4 from `source` to `dest`, adding 1 to each component.
+const static char *strKernel =
+    "__kernel void glmulticontext_test( __global uint4 *source, __global uint4 "
+    "*dest)   \n"
+    "{                                                                         "
+    "         \n"
+    "    int  tid = get_global_id(0);                                          "
+    "         \n"
+    "    dest[ tid ] = source [ tid ] + (uint4)(1);                            "
+    "         \n"
+    "}                                                                         "
+    "         \n";
+
+// Creates the test with all per-context handles zeroed and two sub-tests
+// advertised (0: GL fence sync, 1: glFinish). Both status flags start out
+// false so run() never reads an indeterminate value when open() exits early.
+OCLGLFenceSync::OCLGLFenceSync() : failed_(false), extensionSupported_(false) {
+  memset(contextData_, 0, sizeof(contextData_));
+  _numSubTests = 2;
+}
+
+// Nothing to do here: resources are released in close().
+OCLGLFenceSync::~OCLGLFenceSync() {}
+
+// Opaque GL sync-object handle. Declared ahead of the function-pointer types
+// that use it so this file does not depend on the GL headers providing it; a
+// repeated identical typedef is harmless in C++.
+typedef struct __GLsync *GLsync;
+
+// Function-pointer types for the GL sync-object entry points, which are
+// resolved at run time by InitSyncFns(). Windows builds use __stdcall.
+// glIsSync returns GLboolean, so the pointer type mirrors that instead of
+// C++ bool.
+#ifdef WIN_OS
+typedef GLsync(__stdcall *glFenceSyncPtr)(GLenum condition, GLbitfield flags);
+typedef GLboolean(__stdcall *glIsSyncPtr)(GLsync sync);
+typedef void(__stdcall *glDeleteSyncPtr)(GLsync sync);
+typedef GLenum(__stdcall *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                               GLuint64 timeout);
+typedef void(__stdcall *glWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                       GLuint64 timeout);
+typedef void(__stdcall *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
+typedef void(__stdcall *glGetSyncivPtr)(GLsync sync, GLenum pname,
+                                        GLsizei bufSize, GLsizei *length,
+                                        GLint *values);
+#else
+typedef GLsync (*glFenceSyncPtr)(GLenum condition, GLbitfield flags);
+typedef GLboolean (*glIsSyncPtr)(GLsync sync);
+typedef void (*glDeleteSyncPtr)(GLsync sync);
+typedef GLenum (*glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                      GLuint64 timeout);
+typedef void (*glWaitSyncPtr)(GLsync sync, GLbitfield flags, GLuint64 timeout);
+typedef void (*glGetInteger64vPtr)(GLenum pname, GLint64 *params);
+typedef void (*glGetSyncivPtr)(GLsync sync, GLenum pname, GLsizei bufSize,
+                               GLsizei *length, GLint *values);
+#endif
+
+// Entry points filled in by InitSyncFns(); they stay null until it has run or
+// when the driver does not export the function.
+glFenceSyncPtr glFenceSyncFunc;
+
+glIsSyncPtr glIsSyncFunc;
+
+glDeleteSyncPtr glDeleteSyncFunc;
+
+glClientWaitSyncPtr glClientWaitSyncFunc;
+
+glWaitSyncPtr glWaitSyncFunc;
+
+glGetInteger64vPtr glGetInteger64vFunc;
+
+glGetSyncivPtr glGetSyncivFunc;
+
+// Debug aid: prints the textual form of the current GL error state.
+#define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError()))
+
+#define cl_khr_gl_event 1
+
+// Looks up the GL sync-object entry points by name through the platform's
+// GL loader (wglGetProcAddress on Windows, glXGetProcAddress elsewhere) and
+// stores them in the file-level function pointers.
+static void InitSyncFns() {
+#ifdef WIN_OS
+#define OCLGL_SYNC_PROC(procName) wglGetProcAddress(procName)
+#else
+#define OCLGL_SYNC_PROC(procName) glXGetProcAddress((GLubyte *)procName)
+#endif
+  glFenceSyncFunc = (glFenceSyncPtr)OCLGL_SYNC_PROC("glFenceSync");
+  glIsSyncFunc = (glIsSyncPtr)OCLGL_SYNC_PROC("glIsSync");
+  glDeleteSyncFunc = (glDeleteSyncPtr)OCLGL_SYNC_PROC("glDeleteSync");
+  glClientWaitSyncFunc =
+      (glClientWaitSyncPtr)OCLGL_SYNC_PROC("glClientWaitSync");
+  glWaitSyncFunc = (glWaitSyncPtr)OCLGL_SYNC_PROC("glWaitSync");
+  glGetInteger64vFunc = (glGetInteger64vPtr)OCLGL_SYNC_PROC("glGetInteger64v");
+  glGetSyncivFunc = (glGetSyncivPtr)OCLGL_SYNC_PROC("glGetSynciv");
+#undef OCLGL_SYNC_PROC
+}
+
+#define USING_ARB_sync 1
+
+// Signature of the clCreateEventFromGLsyncKHR extension entry point, which
+// creates an OpenCL event from a GL sync object.
+typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)(
+    cl_context context, GLsync sync, cl_int *errCode_ret);
+
+// Resolved in OCLGLFenceSync::run() via clGetExtensionFunctionAddress().
+clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr;
+
+/* Helper to determine if an extension is supported by a device */
+/* Reports whether `device` lists `extensionName` in CL_DEVICE_EXTENSIONS.
+ *
+ * The extension string is a space-separated list, so the name is matched as a
+ * whole token: a longer extension that merely contains `extensionName` does
+ * not count as a match.
+ *
+ * Returns   0  the extension is present
+ *          -1  the extension is absent (or `extensionName` is NULL/empty)
+ *          -2  the size query failed
+ *          -3  the device reports an empty extension string
+ *         -40  the buffer for the string could not be allocated
+ *          -5  the string query failed
+ */
+int is_extension_available(cl_device_id device, const char *extensionName) {
+  if (NULL == extensionName || '\0' == extensionName[0]) return -1;
+
+  size_t size = 0;
+  int err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &size);
+  if (err) {
+    printf(
+        "Error: failed to determine size of device extensions string (err = "
+        "%d)\n",
+        err);
+    return -2;
+  }
+
+  if (0 == size) return -3;
+
+  /* One extra byte so the buffer can always be terminated locally. */
+  char *extString = (char *)malloc(size + 1);
+  if (NULL == extString) {
+    printf("Error: unable to allocate %lu byte buffer for extension string\n",
+           (unsigned long)(size + 1));
+    return -40;
+  }
+
+  err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, extString, NULL);
+  if (err) {
+    printf("Error: failed to obtain device extensions string (err = %d)\n",
+           err);
+    free(extString);
+    return -5;
+  }
+  extString[size] = '\0';
+
+  /* Walk every occurrence and accept only one delimited by the start/end of
+   * the string or by spaces. */
+  const size_t nameLength = strlen(extensionName);
+  int result = -1;
+  for (const char *hit = strstr(extString, extensionName); hit != NULL;
+       hit = strstr(hit + 1, extensionName)) {
+    const bool tokenStart = (hit == extString) || (hit[-1] == ' ');
+    const char following = hit[nameLength];
+    const bool tokenEnd = (following == ' ') || (following == '\0');
+    if (tokenStart && tokenEnd) {
+      result = 0;
+      break;
+    }
+  }
+
+  free(extString);
+  return result;
+}
+
+// Prepares the fence-sync test.
+//
+// After the common setup, the device is checked once for cl_khr_gl_event; if
+// it is missing the test is skipped (extensionSupported_ stays false). For
+// each of the c_glContextCount contexts a GL context is created, a CL context
+// sharing with it is built from its properties, and a command queue, program
+// and `glmulticontext_test` kernel are created in that CL context.
+//
+// test       - sub-test index (0: fence sync, 1: glFinish); saved for run().
+// units      - forwarded to OCLGLCommon::open().
+// conversion - forwarded to OCLGLCommon::open().
+// deviceId   - index of the device under test.
+void OCLGLFenceSync::open(unsigned int test, char *units, double &conversion,
+                          unsigned int deviceId) {
+  _openTest = test;
+  // Assume "unsupported" until proven otherwise so every early exit below
+  // leaves run() with a well-defined flag.
+  extensionSupported_ = false;
+
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // The answer does not depend on the context, so ask only once.
+  error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event");
+  if (error_ != CL_SUCCESS) {
+    printf("Silent failure: cl_khr_gl_event extension not available (%d)\n",
+           error_);
+    return;
+  }
+  extensionSupported_ = true;
+
+  cl_context_properties properties[7] = {0};
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    createGLContext(contextData_[i].glContext);
+    getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties);
+
+    // Create new CL context from GL context
+    contextData_[i].clContext = _wrapper->clCreateContext(
+        properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)",
+                 error_);
+
+    // Create command queue for new context
+    contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue(
+        contextData_[i].clContext, devices_[_deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+                 error_);
+
+    // Build the kernel
+    contextData_[i].clProgram = _wrapper->clCreateProgramWithSource(
+        contextData_[i].clContext, 1, &strKernel, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clCreateProgramWithSource()  failed (%d)", error_);
+
+    error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1,
+                                      &devices_[deviceId], NULL, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Zero-initialized so a failing log query prints an empty string
+      // instead of stack garbage.
+      char programLog[1024] = {0};
+      _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram,
+                                      devices_[deviceId], CL_PROGRAM_BUILD_LOG,
+                                      sizeof(programLog), programLog, 0);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
+                 error_);
+
+    contextData_[i].clKernel = _wrapper->clCreateKernel(
+        contextData_[i].clProgram, "glmulticontext_test", &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
+                 error_);
+  }
+}
+
+// Runs the GL/CL synchronization test.
+//
+// For every context: two GL buffers are created (input filled with the host
+// data, output empty), wrapped as CL buffers, acquired, processed by the
+// kernel (which adds 1 to every component) and released again; the GL output
+// buffer is then mapped and copied back to the host. Sub-test 0 orders the
+// acquire after GL work with a GL fence converted to a CL event; sub-test 1
+// uses glFinish() instead. The time spent in the synchronization and acquire
+// calls is reported through _perfInfo in microseconds.
+//
+// The test fails if any GL/CL call fails or if the data read back differs
+// from the expected values.
+void OCLGLFenceSync::run() {
+  if (_errorFlag || !extensionSupported_) {
+    return;
+  }
+
+  CPerfCounter timer;
+  cl_uint4 inOutData[c_numOfElements] = {{{0}}};
+  cl_uint4 expectedData[c_numOfElements] = {{{0}}};
+  unsigned int m = sizeof(cl_uint4) / sizeof(cl_uint);
+  int count = 0;
+  // Fill the input with each element's index; every context pass adds 1, so
+  // the expected result is index + c_glContextCount.
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < m; j++) {
+      inOutData[i].s[j] = (unsigned int)i;
+      expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount;
+    }
+  }
+
+  cl_event fenceEvent0 = NULL, fenceEvent = NULL;
+  GLsync glFence0 = NULL, glFence = NULL;
+  InitSyncFns();
+
+  clCreateEventFromGLsyncKHR_ptr =
+      (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddress(
+          "clCreateEventFromGLsyncKHR");
+  if (clCreateEventFromGLsyncKHR_ptr == NULL) {
+    printf(
+        "ERROR: Unable to run fence_sync test (clCreateEventFromGLsyncKHR "
+        "function not discovered!)\n");
+    return;
+  }
+
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    makeCurrent(contextData_[i].glContext);
+
+    // Generate and Bind in & out OpenGL buffers
+    GLuint inGLBuffer = 0, outGLBuffer = 0;
+    glGenBuffers(1, &inGLBuffer);
+    glGenBuffers(1, &outGLBuffer);
+
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    glFinish();
+
+    // Checking if clWaitForEvents works
+    switch (_openTest) {
+      case 0:  // Using fence sync
+        glFence0 = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+        CHECK_RESULT((glFence0 == NULL), "Unable to create GL fence");
+
+        fenceEvent0 = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext,
+                                                     glFence0, &error_);
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to create CL event from GL fence (%d)", error_);
+
+        error_ = clWaitForEvents(1, &fenceEvent0);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)",
+                     error_);
+        break;
+      default:
+        glFinish();
+        break;
+    }
+
+    // The warm-up fence has served its purpose; release it (and the event
+    // wrapping it) before creating the one that is actually timed.
+    if (fenceEvent0 != NULL) {
+      clReleaseEvent(fenceEvent0);
+      fenceEvent0 = NULL;
+    }
+    if (glFence0 != NULL) {
+      glDeleteSync(glFence0);
+      glFence0 = NULL;
+    }
+
+    cl_event acqEvent1 = 0, acqEvent2 = 0, kernelEvent = 0, relEvent1 = 0,
+             relEvent2 = 0;
+
+    // Create input buffer from GL input buffer
+    contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create input GL buffer (%d)", error_);
+
+    // Create output buffer from GL output buffer
+    contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create output GL buffer (%d)", error_);
+
+    timer.Reset();
+    switch (_openTest) {
+      case 0:  // Using fence sync
+        timer.Start();
+        glFence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+        timer.Stop();
+        CHECK_RESULT((glFence == NULL), "Unable to create GL fence");
+
+        timer.Start();
+        fenceEvent = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext,
+                                                    glFence, &error_);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to create CL event from GL fence (%d)", error_);
+        break;
+      default:
+        break;
+    }
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem),
+                                 &(contextData_[i].inputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem),
+                                 &(contextData_[i].outputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    switch (_openTest) {
+      case 0:  // Using fence sync
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 1,
+            &fenceEvent, &acqEvent1);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1,
+            &fenceEvent, &acqEvent2);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+        break;
+      case 1:  // Using glFinish
+        timer.Start();
+        glFinish();
+        timer.Stop();
+
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 0,
+            NULL, &acqEvent1);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0,
+            NULL, &acqEvent2);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+        break;
+      default:
+        break;
+    }
+
+    size_t gws[1] = {c_numOfElements};
+    cl_event evts[2] = {acqEvent1, acqEvent2};
+    error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue,
+                                              contextData_[i].clKernel, 1, NULL,
+                                              gws, NULL, 2, evts, &kernelEvent);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1,
+                                                 &(contextData_[i].inputBuffer),
+                                                 1, &kernelEvent, &relEvent1);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1,
+        &kernelEvent, &relEvent2);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    evts[0] = relEvent1;
+    evts[1] = relEvent2;
+    error_ = clWaitForEvents(2, evts);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)",
+                 error_);
+
+    // All work queued for this context has completed, so the events created
+    // during this pass (and the timed fence) can be dropped now rather than
+    // leaked.
+    cl_event finished[5] = {acqEvent1, acqEvent2, kernelEvent, relEvent1,
+                            relEvent2};
+    for (unsigned int e = 0; e < 5; e++) {
+      if (finished[e] != NULL) clReleaseEvent(finished[e]);
+    }
+    if (fenceEvent != NULL) {
+      clReleaseEvent(fenceEvent);
+      fenceEvent = NULL;
+    }
+    if (glFence != NULL) {
+      glDeleteSync(glFence);
+      glFence = NULL;
+    }
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+    memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4));
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+
+    _wrapper->clReleaseMemObject(contextData_[i].inputBuffer);
+    _wrapper->clReleaseMemObject(contextData_[i].outputBuffer);
+
+    // Delete GL buffers
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glDeleteBuffers(1, &inGLBuffer);
+    inGLBuffer = 0;
+    glDeleteBuffers(1, &outGLBuffer);
+    outGLBuffer = 0;
+  }
+
+  const double sec = timer.GetElapsedTime();
+  const float perf = (float)sec * 1000000;  // in microseconds
+  _perfInfo = (float)perf;
+
+  // Compare expected output with actual data received
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < m; j++) {
+      if (inOutData[i].s[j] != expectedData[i].s[j]) {
+        printf(
+            "Element %u is incorrect!\t expected:[ %u, %u, %u, %u ] differs "
+            "from actual:{%u, %u, %u, %u}\n",
+            i, expectedData[i].s[0], expectedData[i].s[1], expectedData[i].s[2],
+            expectedData[i].s[3], inOutData[i].s[0], inOutData[i].s[1],
+            inOutData[i].s[2], inOutData[i].s[3]);
+
+        count++;
+      }
+    }
+  }
+  if (count) printf("Number of elements wrong: %d\n", count);
+  // Wrong data must fail the test, not just be logged.
+  CHECK_RESULT((count != 0), "fence sync output mismatch (%d components)",
+               count);
+}
+
+// Releases the per-context CL objects and GL contexts created by open(), then
+// runs the common teardown.
+//
+// Cleanup is attempted only when the device reports cl_khr_gl_event, i.e.
+// when open() got past its extension check. CL handles that were never
+// created (open() stopped part-way) are skipped, and the whole table is
+// zeroed afterwards so stale handles cannot be released twice.
+//
+// Returns the value of OCLGLCommon::close().
+unsigned int OCLGLFenceSync::close() {
+  error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event");
+  if (error_ == CL_SUCCESS) {
+    for (unsigned int i = 0; i < c_glContextCount; i++) {
+      GLContextDataSet &ctx = contextData_[i];
+      makeCurrent(ctx.glContext);
+      if (ctx.clKernel != NULL) _wrapper->clReleaseKernel(ctx.clKernel);
+      if (ctx.clProgram != NULL) _wrapper->clReleaseProgram(ctx.clProgram);
+      if (ctx.clCmdQueue != NULL) {
+        _wrapper->clReleaseCommandQueue(ctx.clCmdQueue);
+      }
+      if (ctx.clContext != NULL) _wrapper->clReleaseContext(ctx.clContext);
+      destroyGLContext(ctx.glContext);
+    }
+    // Same reset the constructor performs.
+    memset(contextData_, 0, sizeof(contextData_));
+  }
+
+  return OCLGLCommon::close();
+}
@@ -0,0 +1,55 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_FENCE_SYNC_H_
+#define _OCL_GL_FENCE_SYNC_H_
+
+#include "OCLGLCommon.h"
+
+// Test that shares GL buffers with OpenCL and synchronizes the two APIs
+// either with a GL fence turned into a CL event or with glFinish().
+class OCLGLFenceSync : public OCLGLCommon {
+ public:
+  OCLGLFenceSync();
+  virtual ~OCLGLFenceSync();
+
+  // Checks for cl_khr_gl_event and creates the per-context GL/CL objects.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  // Runs the kernel on the shared buffers and compares the result.
+  virtual void run(void);
+  // Releases the per-context objects.
+  virtual unsigned int close(void);
+
+ private:
+  // Number of GL contexts (each paired with its own CL context) exercised.
+  static const unsigned int c_glContextCount = 1;
+  // Number of uint4 elements in the shared buffers.
+  static const unsigned int c_numOfElements = 8192;
+
+  // Everything owned on behalf of one GL context.
+  struct GLContextDataSet {
+    OCLGLHandle glContext;
+    cl_context clContext;
+    cl_command_queue clCmdQueue;
+    cl_program clProgram;
+    cl_kernel clKernel;
+    cl_mem inputBuffer;
+    cl_mem outputBuffer;
+  };
+  GLContextDataSet contextData_[c_glContextCount];
+
+  // NOTE(review): not referenced by the member functions visible in the
+  // matching .cpp -- confirm whether it is still needed.
+  bool failed_;
+  // Set by open(): true when the device reports cl_khr_gl_event.
+  bool extensionSupported_;
+};
+
+#endif  // _OCL_GL_FENCE_SYNC_H_
@@ -0,0 +1,298 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLMsaaTexture.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+// OpenCL C source of gl_msaa_test: every work-item copies all numSamples
+// samples of its (x, y) texel from the multisample image into `output`, one
+// uint4 per sample.
+static const char* strKernel =
+    "__kernel void gl_msaa_test( __global uint4 *output, "
+    "read_only image2d_msaa_t source, "
+    "unsigned int numSamples){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    for (int i = 0 ; i < numSamples ; i++) {\n"
+    "       uint4 value = read_imageui( source, (int2)( tidX, tidY ) ,i);\n"
+    "       int index = "
+    "(tidY * get_image_width( source ) + tidX)*numSamples + i;\n"
+    "       output[ index ] =  value;\n"
+    "   }\n"
+    "}\n";
+
+const static char* glDownSampleShader =  // GLSL fragment shader source handed to createGLFragmentProgramFromSource() in testMsaaRead()
+    "uniform sampler2DMS MsaaTex;\n"  // multisample texture to fetch from
+    "uniform int numSamples;\n"  // number of samples to accumulate per texel
+    "uniform ivec2 resolution;\n"  // scales the texture coordinate into an integer texel coordinate
+    "\n"
+    "varying vec4  gl_TexCoord[ ];  \n"
+    "\n"
+    "void main(void)\n"
+    "{\n"
+    "    vec4 accum = vec4(0.0,0.0,0.0,0.0);\n"
+    "    ivec2 coord = ivec2(resolution * gl_TexCoord[0].xy) ;\n"
+    "    for ( int i = 0 ; i < numSamples ; i++)\n"  // sum samples 0..numSamples-1 of the texel at coord
+    "    {\n"
+    "        accum += texelFetch(MsaaTex,coord,i);\n"
+    "    }\n"
+    "    accum /= numSamples;\n"  // the fragment colour is the average of the samples
+    "    \n"
+    "  \n"
+    "        \n"
+    "    gl_FragColor = accum;\n"
+    "}";
+
+// Clears every GL object name, CL handle and host buffer pointer, selects
+// sub-test 0 and registers a single sub-test.
+OCLGLMsaaTexture::OCLGLMsaaTexture()
+    : msaaDepthBuffer_(0), msaaFrameBufferOBJ_(0), msaaColorBuffer_(0),
+      glShader_(0), glprogram_(0),
+      clOutputBuffer_(NULL), clMsaa_(NULL),
+      pGLOutput_(NULL), pCLOutput_(NULL) {
+  _currentTest = 0;
+  _numSubTests = 1;
+}
+
+// Nothing to release here; resources are torn down in close().
+OCLGLMsaaTexture::~OCLGLMsaaTexture() {}
+
+// Runs the common CL/GL set-up, records the selected sub-test and builds the
+// gl_msaa_test kernel for the chosen device.
+//
+// test              - sub-test index, stored in _currentTest for run().
+// units, conversion - forwarded unchanged to OCLGLCommon::open().
+// deviceId          - index into devices_ of the device to build for.
+//
+// Returns early, leaving kernel_ unset, when the common set-up reports an
+// error through _errorFlag.
+void OCLGLMsaaTexture::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  _currentTest = test;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled and queried with one byte held back, so the buffer is a
+    // valid C string even when the log query itself fails (for example
+    // because the log does not fit) and writes nothing.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG,
+                                    sizeof(programLog) - 1, programLog, NULL);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gl_msaa_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Executes the sub-test chosen in open() and reports a failure through
+// CHECK_RESULT when it does not pass. Does nothing if an earlier stage
+// already set _errorFlag.
+void OCLGLMsaaTexture::run(void) {
+  if (_errorFlag) {
+    return;
+  }
+  // Starts out as "failed" so an unsupported test number can never be read
+  // back as an indeterminate value.
+  bool retVal = false;
+  switch (_currentTest) {
+    case 0:
+      retVal = testMsaaRead(GL_RGBA, 2);
+      break;
+    default:
+      CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT((retVal != true), "cl-gl msaa texture test failed ");
+}
+
+// Frees the host result buffers, releases the CL memory objects and deletes
+// the GL objects created by testMsaaRead(), then runs the common close.
+// Every handle is reset after release so a repeated call does not release
+// the same object twice. Returns the value of OCLGLCommon::close().
+unsigned int OCLGLMsaaTexture::close(void) {
+  if (pGLOutput_) {
+    free(pGLOutput_);
+    pGLOutput_ = NULL;
+  }
+
+  if (pCLOutput_) {
+    free(pCLOutput_);
+    pCLOutput_ = NULL;
+  }
+
+  // testMsaaRead() can return before either object exists, so only release
+  // what was actually created.
+  if (clMsaa_) {
+    clReleaseMemObject(clMsaa_);
+    clMsaa_ = NULL;
+  }
+  if (clOutputBuffer_) {
+    clReleaseMemObject(clOutputBuffer_);
+    clOutputBuffer_ = NULL;
+  }
+
+  glFinish();
+  // unbind the texture and frame buffer.
+  // NOTE(review): these detach calls act on whichever framebuffer is bound at
+  // this point; after a successful testMsaaRead() that is framebuffer 0 --
+  // confirm this is intended.
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0);
+
+  // clean gl resources
+  glDeleteFramebuffers(1, &msaaFrameBufferOBJ_);
+  msaaFrameBufferOBJ_ = 0;
+  glDeleteTextures(1, &msaaColorBuffer_);
+  msaaColorBuffer_ = 0;
+  glDeleteTextures(1, &msaaDepthBuffer_);
+  msaaDepthBuffer_ = 0;
+
+  glDeleteProgram(glprogram_);
+  glprogram_ = 0;
+  glDeleteShader(glShader_);
+  glShader_ = 0;
+
+  return OCLGLCommon::close();
+}
+
+// Renders a quad into a multisampled FBO, reads the colour attachment from
+// OpenCL through the gl_msaa_test kernel, then resolves the same texture with
+// the GL down-sample shader and compares both results with absDiff().
+//
+// internalFormat - currently unused; the colour texture is always GL_RGBA8.
+// numSamples     - sample count of both multisample textures; also passed to
+//                  the kernel and to the shader.
+//
+// Returns true when absDiff() accepts the two buffers, false on an incomplete
+// framebuffer, a failed CL call, a failed allocation or a mismatch. Objects
+// created before an early return are released by close().
+//
+// NOTE(review): bufferSize holds 4 bytes per pixel, whereas the kernel stores
+// one uint4 per sample per pixel -- confirm the intended output layout.
+bool OCLGLMsaaTexture::testMsaaRead(GLint internalFormat,
+                                    unsigned int numSamples) {
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+
+  unsigned int bufferSize = c_dimSize * c_dimSize * 4;
+  createGLFragmentProgramFromSource(glDownSampleShader, glShader_, glprogram_);
+
+  /////////////////////
+  // create msaa FBO //
+  /////////////////////
+  glGenFramebuffers(1, &msaaFrameBufferOBJ_);
+  glBindFramebuffer(GL_FRAMEBUFFER, msaaFrameBufferOBJ_);
+
+  // create   textures
+  glGenTextures(1, &msaaColorBuffer_);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_);
+  glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples, GL_RGBA8,
+                          c_dimSize, c_dimSize, GL_TRUE);
+
+  glGenTextures(1, &msaaDepthBuffer_);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaDepthBuffer_);
+  glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples,
+                          GL_DEPTH_COMPONENT24, c_dimSize, c_dimSize, GL_TRUE);
+
+  // attach colour and depth textures to the FBO
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, msaaColorBuffer_,
+                       0);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, msaaDepthBuffer_,
+                       0);
+
+  // verify all resource allocations are well.
+  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  if (GL_FRAMEBUFFER_COMPLETE != status) {
+    return false;
+  }
+  // set up gl state machine
+  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
+  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
+  glLoadIdentity();                        // Reset The Projection Matrix
+  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
+  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
+  glLoadIdentity();
+  glEnable(GL_DEPTH_TEST);
+  // The Type Of Depth Testing To Do
+  glClear(GL_COLOR_BUFFER_BIT |
+          GL_DEPTH_BUFFER_BIT);     // Clear Screen And Depth Buffer
+  glBegin(GL_QUADS);                // Draw A Quad
+  glVertex3f(-1.0f, 1.0f, -6.0f);   // Top Left
+  glVertex3f(1.0f, 1.0f, -6.0f);    // Top Right
+  glVertex3f(1.0f, -1.0f, -3.0f);   // Bottom Right
+  glVertex3f(-1.0f, -1.0f, -3.0f);  // Bottom Left
+  glEnd();
+
+  glFinish();
+  cl_int error;
+  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                             bufferSize, NULL, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clMsaa_ = _wrapper->clCreateFromGLTexture(context_, CL_MEM_READ_WRITE,
+                                            GL_TEXTURE_2D_MULTISAMPLE, 0,
+                                            msaaColorBuffer_, &error);
+  if (CL_SUCCESS != error) return false;
+
+  // Informational query only: neither its status nor the reported sample
+  // count takes part in the pass/fail decision.
+  GLsizei samples = 0;
+  _wrapper->clGetGLTextureInfo(clMsaa_, CL_GL_NUM_SAMPLES, sizeof(samples),
+                               &samples, NULL);
+
+  error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
+                                              &clMsaa_, 0, NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+
+  // Any failure below would leave pCLOutput_ without valid kernel output, so
+  // each step is checked instead of comparing garbage later on.
+  error =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
+  if (CL_SUCCESS != error) return false;
+
+  error = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clMsaa_);
+  if (CL_SUCCESS != error) return false;
+
+  error =
+      _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), &numSamples);
+  if (CL_SUCCESS != error) return false;
+
+  error = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                           NULL, dimSizes, NULL, 0, NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+
+  error = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1,
+                                              &clMsaa_, 0, NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+
+  pGLOutput_ = (unsigned int*)malloc(bufferSize);
+  pCLOutput_ = (unsigned int*)malloc(bufferSize);
+  if (NULL == pGLOutput_ || NULL == pCLOutput_) return false;
+
+  // Blocking read: pCLOutput_ is complete once this call returns.
+  error = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_,
+                                        CL_TRUE, 0, bufferSize, pCLOutput_, 0,
+                                        NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+
+  // down sample
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_);
+  glUseProgram(glprogram_);
+
+  glUniform1i(glGetUniformLocation(glprogram_, "numSamples"), numSamples);
+  glUniform2i(glGetUniformLocation(glprogram_, "resolution"), c_dimSize,
+              c_dimSize);
+  glUniform1i(glGetUniformLocation(glprogram_, "MsaaTex"), 0);
+
+  // printOpenGLError();
+
+  // NOTE(review): each glTexCoord2f follows its glVertex2f, so it applies to
+  // the next vertex rather than the one just issued -- confirm the intent.
+  glBegin(GL_QUADS);
+  glVertex2f(-1.0f, 1.0f);
+  glTexCoord2f(1.0f, 0.0f);
+  glVertex2f(1.0f, 1.0f);
+  glTexCoord2f(1.0f, 1.0f);
+  glVertex2f(1.0f, -1.0f);
+  glTexCoord2f(0.0f, 1.0f);
+  glVertex2f(-1.0f, -1.0f);
+  glTexCoord2f(0.0f, 0.0f);
+  glEnd();
+
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0);
+  glUseProgram(0);
+
+  glReadPixels(0, 0, c_dimSize, c_dimSize, GL_BGRA, GL_UNSIGNED_BYTE,
+               pGLOutput_);
+
+  return absDiff(pGLOutput_, pCLOutput_, c_dimSize);
+}
+
+// Compares two images of c_dimSize x c_dimSize pixels, where each
+// unsigned int holds one pixel as four 8-bit channels.
+//
+// pGLBuffer - pixels read back from OpenGL.
+// pCLBuffer - pixels produced by the OpenCL kernel.
+// c_dimSize - edge length of the square image, in pixels.
+//
+// Returns true when every channel of every pixel differs by at most 10,
+// false as soon as one channel exceeds that tolerance.
+bool OCLGLMsaaTexture::absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
+                               const unsigned int c_dimSize) {
+  const int kTolerance = 10;
+  const unsigned int pixelCount = c_dimSize * c_dimSize;
+  for (unsigned int i = 0; i < pixelCount; i++) {
+    // Channels are unsigned bytes; plain char could be signed and would turn
+    // values above 127 negative, corrupting the difference.
+    unsigned char clPixel[4];
+    unsigned char glPixel[4];
+    memcpy(clPixel, &(pCLBuffer[i]), sizeof(clPixel));
+    memcpy(glPixel, &(pGLBuffer[i]), sizeof(glPixel));
+
+    for (int j = 0; j < 4; j++) {
+      // Compare channel j with channel j, using an int so a difference of up
+      // to 255 is not truncated.
+      const int diff = abs((int)clPixel[j] - (int)glPixel[j]);
+      if (diff > kTolerance) return false;
+    }
+  }
+  return true;
+}
@@ -0,0 +1,68 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_MSAA_TEXTURE_H_
+#define _OCL_GL_MSAA_TEXTURE_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL interop test that reads a multisampled GL texture from an OpenCL
+// kernel and compares it with a GL-side resolve of the same texture.
+class OCLGLMsaaTexture : public OCLGLCommon {
+ public:
+  // Edge length, in pixels, of the square render target.
+  static const unsigned int c_dimSize = 128;
+
+  OCLGLMsaaTexture();
+  virtual ~OCLGLMsaaTexture();
+
+  // Test entry points overriding the OCLGLCommon versions.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  // --- test functions ---
+
+  // Runs the MSAA read sub-test; returns true when it passes.
+  bool testMsaaRead(GLint internalFormat, unsigned int NumSamples);
+  // Sub-test index selected in open().
+  unsigned int _currentTest;
+
+  // --- private helper functions ---
+
+  // Compares two dimSize x dimSize pixel buffers; returns true when they
+  // are considered equal.
+  static bool absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
+                      const unsigned int dimSize);
+
+  // --- private members ---
+
+  // GL resource identifiers
+  GLuint msaaDepthBuffer_;
+  GLuint msaaFrameBufferOBJ_;
+  GLuint msaaColorBuffer_;
+  GLuint glShader_;
+  GLuint glprogram_;
+  // CL identifiers
+  cl_mem clOutputBuffer_;
+  cl_mem clMsaa_;
+
+  // Host copies of the GL and CL results, allocated by testMsaaRead().
+  unsigned int* pGLOutput_;
+  unsigned int* pCLOutput_;
+};
+
+#endif  // _OCL_GL_MSAA_TEXTURE_H_
@@ -0,0 +1,231 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLMultiContext.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =  // OpenCL C source of glmulticontext_test, built once per GL context in open()
+    "__kernel void glmulticontext_test( __global uint4 *source, __global uint4 "
+    "*dest)   \n"
+    "{                                                                         "
+    "         \n"
+    "    int  tid = get_global_id(0);                                          "
+    "         \n"
+    "    dest[ tid ] = source[ tid ] + (uint4)(1);                             "  // each work-item writes source + 1 (per component) to dest
+    "         \n"
+    "}                                                                         "
+    "         \n";
+
+// Clears the per-context bookkeeping, initialises failed_ (previously left
+// indeterminate) and registers a single sub-test.
+OCLGLMultiContext::OCLGLMultiContext() : failed_(false) {
+  memset(contextData_, 0, sizeof(contextData_));
+  _numSubTests = 1;
+}
+
+// Nothing to release here; resources are torn down in close().
+OCLGLMultiContext::~OCLGLMultiContext() {}
+
+// Runs the common set-up, then for each of the c_glContextCount slots creates
+// a GL context, a CL context sharing with it, a command queue and a built
+// glmulticontext_test kernel.
+//
+// test, units, conversion - forwarded unchanged to OCLGLCommon::open().
+// deviceId                - index into devices_ used when building.
+//
+// Failures are reported through CHECK_RESULT; handles of slots that were not
+// reached keep the zero value given by the constructor.
+void OCLGLMultiContext::open(unsigned int test, char* units, double& conversion,
+                             unsigned int deviceId) {
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  cl_context_properties properties[7] = {0};
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    createGLContext(contextData_[i].glContext);
+    getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties);
+
+    // Create new CL context from GL context
+    contextData_[i].clContext = _wrapper->clCreateContext(
+        properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)",
+                 error_);
+
+    // Create command queue for new context
+    contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue(
+        contextData_[i].clContext, devices_[_deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+                 error_);
+
+    // Build the kernel
+    contextData_[i].clProgram = _wrapper->clCreateProgramWithSource(
+        contextData_[i].clContext, 1, &strKernel, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clCreateProgramWithSource()  failed (%d)", error_);
+
+    error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1,
+                                      &devices_[deviceId], NULL, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Zero-filled and queried with one byte held back, so the buffer is a
+      // valid C string even when the log query itself fails (for example
+      // because the log does not fit) and writes nothing.
+      char programLog[1024] = {0};
+      _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram,
+                                      devices_[deviceId], CL_PROGRAM_BUILD_LOG,
+                                      sizeof(programLog) - 1, programLog, NULL);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
+                 error_);
+
+    contextData_[i].clKernel = _wrapper->clCreateKernel(
+        contextData_[i].clProgram, "glmulticontext_test", &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
+                 error_);
+  }
+}
+
+// Pushes the same data through every GL/CL context pair in turn. Each pass
+// uploads inOutData into a GL buffer, runs the kernel (which adds 1 to every
+// component) through CL/GL sharing and reads the result back into inOutData,
+// so after c_glContextCount passes every component must have grown by
+// c_glContextCount. Mismatches and API failures go through CHECK_RESULT.
+// Does nothing if an earlier stage already set _errorFlag.
+void OCLGLMultiContext::run() {
+  if (_errorFlag) {
+    return;
+  }
+
+  cl_uint4 inOutData[c_numOfElements] = {{{0}}};
+  cl_uint4 expectedData[c_numOfElements] = {{{0}}};
+
+  // Initialize input data with random values
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      inOutData[i].s[j] = (unsigned int)rand();
+      expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount;
+    }
+  }
+
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    makeCurrent(contextData_[i].glContext);
+
+    // Generate and Bind in & out OpenGL buffers
+    GLuint inGLBuffer = 0, outGLBuffer = 0;
+    glGenBuffers(1, &inGLBuffer);
+    glGenBuffers(1, &outGLBuffer);
+
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glFinish();
+
+    // Create input buffer from GL input buffer
+    contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create input GL buffer (%d)", error_);
+
+    // Create output buffer from GL output buffer
+    contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create output GL buffer (%d)", error_);
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem),
+                                 &(contextData_[i].inputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem),
+                                 &(contextData_[i].outputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueAcquireGLObjects(contextData_[i].clCmdQueue, 1,
+                                                 &(contextData_[i].inputBuffer),
+                                                 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueAcquireGLObjects(
+        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL,
+        NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    size_t gws[1] = {c_numOfElements};
+    error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue,
+                                              contextData_[i].clKernel, 1, NULL,
+                                              gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1,
+                                                 &(contextData_[i].inputBuffer),
+                                                 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL,
+        NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    error_ = _wrapper->clFinish(contextData_[i].clCmdQueue);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_);
+
+    // Read the kernel output back; it becomes the input of the next context.
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    // glMapBuffer() yields NULL on failure; copying from it would crash.
+    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+    memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4));
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+
+    _wrapper->clReleaseMemObject(contextData_[i].inputBuffer);
+    _wrapper->clReleaseMemObject(contextData_[i].outputBuffer);
+
+    // Delete GL buffers
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glDeleteBuffers(1, &inGLBuffer);
+    inGLBuffer = 0;
+    glDeleteBuffers(1, &outGLBuffer);
+    outGLBuffer = 0;
+  }
+
+  // Compare expected output with actual data received
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      // Adjacent literals replace the former backslash line-splice, which
+      // embedded the source indentation in the message; %u matches the
+      // unsigned arguments.
+      CHECK_RESULT((inOutData[i].s[j] != expectedData[i].s[j]),
+                   "Element %u is incorrect!\n\t"
+                   "expected:{%u, %u, %u, %u} differs from "
+                   "actual:{%u, %u, %u, %u}",
+                   i, expectedData[i].s[0], expectedData[i].s[1],
+                   expectedData[i].s[2], expectedData[i].s[3],
+                   inOutData[i].s[0], inOutData[i].s[1], inOutData[i].s[2],
+                   inOutData[i].s[3]);
+    }
+  }
+}
+
+// For every context slot: makes its GL context current, releases the CL
+// kernel, program, command queue and context created in open(), then destroys
+// the GL context. Finishes with the common close and returns its result.
+unsigned int OCLGLMultiContext::close() {
+  unsigned int slot = 0;
+  while (slot < c_glContextCount) {
+    GLContextDataSet& ctx = contextData_[slot];
+    makeCurrent(ctx.glContext);
+    _wrapper->clReleaseKernel(ctx.clKernel);
+    _wrapper->clReleaseProgram(ctx.clProgram);
+    _wrapper->clReleaseCommandQueue(ctx.clCmdQueue);
+    _wrapper->clReleaseContext(ctx.clContext);
+    destroyGLContext(ctx.glContext);
+    ++slot;
+  }
+  return OCLGLCommon::close();
+}
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_MULTI_CONTEXT_H_
+#define _OCL_GL_MULTI_CONTEXT_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL interop test that drives the same kernel through several independent
+// GL contexts, each paired with its own CL context.
+class OCLGLMultiContext : public OCLGLCommon {
+ public:
+  OCLGLMultiContext();
+  virtual ~OCLGLMultiContext();
+
+  // Test entry points overriding the OCLGLCommon versions.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  // Number of GL/CL context pairs exercised (size of contextData_).
+  static const unsigned int c_glContextCount = 3;
+  // Number of cl_uint4 elements processed per context.
+  static const unsigned int c_numOfElements = 128;
+
+  // A GL context together with the CL objects created for it.
+  struct GLContextDataSet {
+    OCLGLHandle glContext;
+    cl_context clContext;
+    cl_command_queue clCmdQueue;
+    cl_program clProgram;
+    cl_kernel clKernel;
+    cl_mem inputBuffer;
+    cl_mem outputBuffer;
+  };
+  GLContextDataSet contextData_[c_glContextCount];
+
+  bool failed_;
+};
+
+#endif  // _OCL_GL_MULTI_CONTEXT_H_
@@ -0,0 +1,144 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLTexture.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernelui =  // OpenCL C source of gltexture_test for unsigned-integer images; open() selects it for sub-test 0
+    "__kernel void gltexture_test(read_only image2d_t source, write_only "
+    "image2d_t dest)    \n"
+    "{                                                                         "
+    "             \n"
+    "    int  tidX = get_global_id(0);                                         "
+    "             \n"
+    "    int  tidY = get_global_id(1);                                         "
+    "             \n"
+    "    uint4 pixel = read_imageui(source, (int2)(tidX, tidY));               "  // copy one pixel from source ...
+    "             \n"
+    "    write_imageui(dest, (int2)(tidX, tidY), pixel);                       "  // ... to the same coordinate in dest
+    "             \n"
+    "}";
+
+const static char* strKernelf =  // OpenCL C source of gltexture_test using float reads/writes; open() selects it for every sub-test other than 0
+    "__kernel void gltexture_test(read_only image2d_t source, write_only "
+    "image2d_t dest)    \n"
+    "{                                                                         "
+    "             \n"
+    "    int  tidX = get_global_id(0);                                         "
+    "             \n"
+    "    int  tidY = get_global_id(1);                                         "
+    "             \n"
+    "    float4 pixel = read_imagef(source, (int2)(tidX, tidY));               "  // copy one pixel from source ...
+    "             \n"
+    "    write_imagef(dest, (int2)(tidX, tidY), pixel);                        "  // ... to the same coordinate in dest
+    "            \n"
+    "}                                                                         "
+    "             \n";
+
+// Clears all members and registers 4 * 2 sub-tests. currentTest_ and
+// testRender_ are initialised here as well: open() returns before assigning
+// them when the common set-up fails, and run() switches on currentTest_.
+OCLGLTexture::OCLGLTexture()
+    : currentTest_(0),
+      inDataGL_(NULL),
+      outDataGL_(NULL),
+      inGLTexture_(0),
+      outGLTexture_(0),
+      testRender_(false) {
+  _numSubTests = 4 * 2;
+}
+
+// Nothing to release here; resources are torn down in close().
+OCLGLTexture::~OCLGLTexture() {}
+
+// Runs the common CL/GL set-up, decodes the sub-test number and builds the
+// matching gltexture_test kernel.
+//
+// test              - sub-test number; test % 4 selects the texture format
+//                     handled in run(), test / 4 >= 1 sets testRender_.
+// units, conversion - forwarded unchanged to OCLGLCommon::open().
+// deviceId          - index into devices_ of the device to build for.
+void OCLGLTexture::open(unsigned int test, char* units, double& conversion,
+                        unsigned int deviceId) {
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  currentTest_ = test % 4;
+  testRender_ = ((test / 4) >= 1) ? true : false;
+
+  // Build the kernel: the integer variant for sub-test 0, otherwise the
+  // float variant.
+  const char** kernelSource = (0 == currentTest_) ? &strKernelui : &strKernelf;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, kernelSource,
+                                                 NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled and queried with one byte held back, so the buffer is a
+    // valid C string even when the log query itself fails (for example
+    // because the log does not fit) and writes nothing.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG,
+                                    sizeof(programLog) - 1, programLog, NULL);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gltexture_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Dispatches to runTextureTest<T>() with the GL internal format, pixel format
+// and component type that belong to the sub-test selected in open(), and
+// reports a failure through CHECK_RESULT when it does not pass.
+void OCLGLTexture::run(void) {
+  // Same guard as the sibling CL/GL tests: without it a failed open() would
+  // still run the test with an unbuilt kernel.
+  if (_errorFlag) {
+    return;
+  }
+  bool retVal = false;
+  switch (currentTest_) {
+    case 0:
+      retVal = runTextureTest<unsigned int>(GL_RGBA32UI, GL_RGBA_INTEGER,
+                                            GL_UNSIGNED_INT);
+      break;
+    case 1:
+      retVal =
+          runTextureTest<unsigned char>(GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE);
+      break;
+    case 2:
+      retVal = runTextureTest<short>(GL_RGBA16, GL_RGBA, GL_SHORT);
+      break;
+    case 3:
+      retVal = runTextureTest<float>(GL_RGBA32F, GL_RGBA, GL_FLOAT);
+      break;
+    default:
+      CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT((retVal != true), "cl-gl texture interop test failed ");
+}
+
+// Releases the CL images held in buffers_, deletes the GL textures, frees the
+// host-side pixel arrays and finishes with the common close, whose result is
+// returned.
+unsigned int OCLGLTexture::close(void) {
+  // Release whatever was actually created: buffers_ has fewer than two
+  // entries when open() or the test bailed out early, so fixed indexing
+  // would read past the end.
+  for (size_t i = 0; i < buffers_.size(); ++i) {
+    clReleaseMemObject(buffers_[i]);
+  }
+  buffers_.clear();
+  // Delete GL in & out buffers
+  glFinish();
+  glBindTexture(GL_TEXTURE_2D, 0);
+  glDeleteTextures(1, &inGLTexture_);
+  inGLTexture_ = 0;
+  glDeleteTextures(1, &outGLTexture_);
+  outGLTexture_ = 0;
+
+  free(inDataGL_);
+  inDataGL_ = NULL;
+  free(outDataGL_);
+  outDataGL_ = NULL;
+  return OCLGLCommon::close();
+}
@@ -0,0 +1,214 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_TEXTURE_H_
+#define _OCL_GL_TEXTURE_H_
+
+#include <iostream>
+
+#include "OCLGLCommon.h"
+
+// GL/CL interop test: 2D GL textures are shared with OpenCL through
+// clCreateFromGLTexture and the texture contents are checked after a CL kernel
+// has run on them (see runTextureTest() below).
+class OCLGLTexture : public OCLGLCommon {
+ public:
+  // Dimensions, in pixels, of the textures used by every sub-test.
+  static const unsigned int c_imageWidth = 512;
+  static const unsigned int c_imageHeight = 512;
+  // Number of components stored per pixel in the host buffers.
+  static const unsigned int c_elementsPerPixel = 4;
+
+  OCLGLTexture();
+  virtual ~OCLGLTexture();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Index of the sub-test that run() dispatches on.
+  unsigned int currentTest_;
+  // Host-side copies of the input / output texture data; allocated with
+  // malloc() in runTextureTest() and released with free() in close().
+  void* inDataGL_;
+  void* outDataGL_;
+  // GL texture names generated in runTextureTest() and deleted in close().
+  GLuint inGLTexture_;
+  GLuint outGLTexture_;
+  // When true, runTextureTest() also clears the input texture through a
+  // framebuffer and validates against the clear color instead of the input.
+  bool testRender_;
+  // Runs one sub-test with component type T; returns true on success.
+  template <typename T>
+  bool runTextureTest(GLint internalFormat, GLenum format, GLenum type);
+};
+
+// Runs one texture interop sub-test for component type T.
+//
+// Two GL textures of c_imageWidth x c_imageHeight pixels are created (one
+// filled with pseudo-random values, one zeroed), wrapped as CL images and
+// passed to kernel_ as arguments 0 and 1. After the kernel has run, the output
+// texture is read back through GL and compared element by element.
+//
+// internalFormat/format/type are forwarded to glTexImage2D; format/type are
+// also used for the glGetTexImage read-back.
+//
+// When testRender_ is set, the input texture is additionally cleared through a
+// framebuffer and the CL pass is executed twice; the output is then expected
+// to hold the clear color rather than the random input data.
+//
+// Returns true when every element matches, false on any allocation/GL/CL
+// failure or on the first mismatch. The created images stay in buffers_ and
+// the host buffers in inDataGL_/outDataGL_; they are released in close().
+template <typename T>
+bool OCLGLTexture::runTextureTest(GLint internalFormat, GLenum format,
+                                  GLenum type) {
+  const size_t elementCount =
+      (size_t)c_imageWidth * c_imageHeight * c_elementsPerPixel;
+  const size_t byteCount = elementCount * sizeof(T);
+
+  cl_mem image;
+  inDataGL_ = malloc(byteCount);
+  outDataGL_ = malloc(byteCount);
+  if (inDataGL_ == NULL || outDataGL_ == NULL) {
+    // Whichever buffer was obtained is freed later by close().
+    printf("Unable to allocate host memory for texture data");
+    return false;
+  }
+
+  // Initialize input data with random values
+  T* inputIterator = (T*)inDataGL_;
+  for (size_t i = 0; i < elementCount; i++) {
+    inputIterator[i] = (T)(rand() % 255);
+  }
+  // Initialize output data with zeros
+  memset(outDataGL_, 0, byteCount);
+
+  // Generate and Bind in & out OpenGL textures
+  glGenTextures(1, &inGLTexture_);
+  glGenTextures(1, &outGLTexture_);
+
+  glBindTexture(GL_TEXTURE_2D, inGLTexture_);
+  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth,
+               (GLsizei)c_imageHeight, 0, format, type, inDataGL_);
+
+  glBindTexture(GL_TEXTURE_2D, outGLTexture_);
+  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth,
+               (GLsizei)c_imageHeight, 0, format, type, outDataGL_);
+
+  glFinish();
+
+  // Create input buffer from GL input texture
+  image = _wrapper->clCreateFromGLTexture(
+      context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, inGLTexture_, &error_);
+  if (error_ != CL_SUCCESS) {
+    printf("Unable to create input buffer from GL texture (%d)", error_);
+    return false;
+  }
+  buffers_.push_back(image);
+
+  // Create output buffer from GL output texture
+  image = _wrapper->clCreateFromGLTexture(
+      context_, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, outGLTexture_, &error_);
+  if (error_ != CL_SUCCESS) {
+    printf("Unable to create output buffer from GL texture (%d)", error_);
+    return false;
+  }
+  buffers_.push_back(image);
+  size_t gws[2] = {c_imageWidth, c_imageHeight};
+
+  // Assign args
+  for (unsigned int i = 0; i < buffers_.size(); i++) {
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]);
+    if (error_ != CL_SUCCESS) {
+      printf("clSetKernelArg() failed (%d)", error_);
+      return false;
+    }
+  }
+
+  // In render mode the CL pass runs twice: once after the first clear and once
+  // after the second clear issued at the end of the first iteration.
+  int loop = (testRender_) ? 2 : 1;
+  for (int l = 0; l < loop; ++l) {
+    if (testRender_ && (l == 0)) {
+      // NOTE(review): this framebuffer object is never unbound or deleted in
+      // the code visible here - confirm whether that is intentional.
+      GLuint FrameBufferName = 0;
+      glGenFramebuffers(1, &FrameBufferName);
+      glBindFramebuffer(GL_FRAMEBUFFER, FrameBufferName);
+      glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, inGLTexture_,
+                           0);
+      glClearColor(.5f, 1.f, 1.0f, 0);
+      glClear(GL_COLOR_BUFFER_BIT);
+      glFinish();
+    }
+
+    error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2,
+                                                 &buffers()[0], 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      printf("Unable to acquire GL objects (%d)", error_);
+      return false;
+    }
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      printf("clEnqueueNDRangeKernel() failed (%d)", error_);
+      return false;
+    }
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2,
+                                                 &buffers()[0], 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      printf("clEnqueueReleaseGLObjects failed (%d)", error_);
+      return false;
+    }
+
+    error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+    if (error_ != CL_SUCCESS) {
+      printf("clFinish() failed (%d)", error_);
+      return false;
+    }
+
+    if (testRender_ && (l == 0)) {
+      glClearColor(1.f, 1.f, 1.f, 1.f);
+      glClear(GL_COLOR_BUFFER_BIT);
+      glFinish();
+    }
+  }
+
+  // Get the results from GL texture
+  glBindTexture(GL_TEXTURE_2D, outGLTexture_);
+  glActiveTexture(GL_TEXTURE0);
+  glGetTexImage(GL_TEXTURE_2D, 0, format, type, outDataGL_);
+
+  // Check output texture data
+  T* outputIterator = (T*)outDataGL_;
+  // Value each component is compared against in render mode, per GL type.
+  T color;
+  switch (type) {
+    case GL_UNSIGNED_INT:
+      color = (T)0x3f800000;
+      break;
+    case GL_UNSIGNED_BYTE:
+      color = (T)0xff;
+      break;
+    case GL_SHORT:
+      color = (T)0x7fff;
+      break;
+    case GL_FLOAT:
+      color = (T)1.f;
+      break;
+    default:
+      return false;
+  }
+  // Unary '+' promotes narrow integer types so that unsigned char elements are
+  // printed as numbers rather than as raw characters.
+  for (size_t i = 0; i < elementCount; i++) {
+    if (testRender_) {
+      if (outputIterator[i] != color) {
+        std::cout << "Element " << i
+                  << " in output texture is incorrect! (internal format = "
+                  << internalFormat << "\n\t expected clear color:" << +color
+                  << " differs from actual: " << +outputIterator[i]
+                  << std::endl;
+        return false;
+      }
+    } else if (inputIterator[i] != outputIterator[i]) {
+      std::cout << "Element " << i
+                << " in output texture is incorrect! (internal format = "
+                << internalFormat << "\n\t expected:" << +inputIterator[i]
+                << " differs from actual: " << +outputIterator[i] << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+#endif  // _OCL_GL_TEXTURE_H_
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+//
+// Includes for tests
+//
+#include "OCLGLBuffer.h"
+#include "OCLGLBufferMultipleQueues.h"
+#include "OCLGLDepthBuffer.h"
+#include "OCLGLDepthTex.h"
+#include "OCLGLFenceSync.h"
+#include "OCLGLMsaaTexture.h"
+#include "OCLGLMultiContext.h"
+#include "OCLGLTexture.h"
+
+//
+//  Factory helper used by the TEST() macro: heap-allocates a
+//  default-constructed T and hands it back as an untyped pointer.
+//
+template <typename T>
+static void* dictionary_CreateTestFunc(void) {
+  T* instance = new T();
+  return instance;
+}
+
+// Expands to a TestEntry initializer: the stringified class name plus the
+// address of the factory instantiated for that class.
+#define TEST(name) \
+  { #name, &dictionary_CreateTestFunc<name> }
+
+// Tests exported by this module; each TEST() entry pairs a class name with
+// its factory function.
+TestEntry TestList[] = {
+    TEST(OCLGLBuffer),
+    TEST(OCLGLBufferMultipleQueues),
+    TEST(OCLGLTexture),
+    TEST(OCLGLMultiContext),
+    TEST(OCLGLFenceSync),
+    TEST(OCLGLDepthTex),
+};
+
+// Number of entries in TestList, plus the module's version and name.
+unsigned int TestListCount = sizeof(TestList) / sizeof(*TestList);
+unsigned int TestLibVersion = 0;
+const char* TestLibName = "oclgl";
@@ -0,0 +1 @@
+# all clear
@@ -0,0 +1,206 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _BaseTestImp_H_
+#define _BaseTestImp_H_
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+
+#include "OCLTest.h"
+#include "OCLWrapper.h"
+
+// Numeric code named for a "silent failure" exit.
+#define EXIT_SILENT_FAILURE 2
+// Stringifies its arguments, so kernel source can be written inline as tokens.
+#define KERNEL(...) #__VA_ARGS__
+
+// With the Microsoft compiler, snprintf is redirected to sprintf_s.
+#ifdef _MSC_VER
+#define snprintf sprintf_s
+#endif
+
+// If `error` is not CL_SUCCESS: sets _errorFlag, prints `msg` and the error
+// code, stores `msg` in _errorMsg, bumps _crcword and returns from the
+// enclosing (void) function. Expands to a bare `if` block, so avoid using it
+// as the unbraced body of an if/else.
+#define CHECK_ERROR(error, msg)                       \
+  if (error != CL_SUCCESS) {                          \
+    _errorFlag = true;                                \
+    printf("\n\n%s\nError code: %d\n\n", msg, error); \
+    _errorMsg = msg;                                  \
+    _crcword += 1;                                    \
+    return;                                           \
+  }
+
+// Same bookkeeping as CHECK_ERROR, but execution continues after the failure
+// has been recorded (no return statement).
+#define CHECK_ERROR_NO_RETURN(error, msg)             \
+  if (error != CL_SUCCESS) {                          \
+    _errorFlag = true;                                \
+    printf("\n\n%s\nError code: %d\n\n", msg, error); \
+    _errorMsg = msg;                                  \
+    _crcword += 1;                                    \
+  }
+
+// If `test` is true (i.e. the failure condition holds): formats `msg` with the
+// optional printf-style arguments into a temporary 4096-byte heap buffer,
+// prints it with file/line, stores the formatted text in _errorMsg, sets
+// _errorFlag, bumps _crcword and returns from the enclosing (void) function.
+// The result of malloc() is not checked, and the length assertion is only
+// active when assert() is enabled.
+#define CHECK_RESULT(test, msg, ...)                  \
+  if ((test)) {                                       \
+    char* buf = (char*)malloc(4096);                  \
+    _errorFlag = true;                                \
+    int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \
+    assert(rc >= 0 && rc < (int)4096);                \
+    printf("%s:%d - %s\n", __FILE__, __LINE__, buf);  \
+    _errorMsg = std::string(buf);                     \
+    _crcword += 1;                                    \
+    free(buf);                                        \
+    return;                                           \
+  }
+
+// Alias of CHECK_RESULT (same expansion).
+#define CHECK_RESULT_ARGS CHECK_RESULT
+
+// Same reporting as CHECK_RESULT, but execution continues after the failure
+// has been recorded. If `test` is true, `msg` is formatted with the optional
+// printf-style arguments, printed with file/line and stored in _errorMsg;
+// _errorFlag is set and _crcword is bumped.
+// _errorMsg receives the formatted text (as in CHECK_RESULT), not the raw
+// format string.
+#define CHECK_RESULT_NO_RETURN(test, msg, ...)        \
+  if ((test)) {                                       \
+    char* buf = (char*)malloc(4096);                  \
+    _errorFlag = true;                                \
+    int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \
+    assert(rc >= 0 && rc < (int)4096);                \
+    printf("%s:%d - %s\n", __FILE__, __LINE__, buf);  \
+    _errorMsg = std::string(buf);                     \
+    _crcword += 1;                                    \
+    free(buf);                                        \
+  }
+
+// Alias of CHECK_RESULT_NO_RETURN (same expansion).
+#define CHECK_RESULT_NO_RETURN_ARGS CHECK_RESULT_NO_RETURN
+
+// If `test` is true: records and prints `msg`, calls close() and then returns
+// from the enclosing (void) function. `msg` is used verbatim (no formatting).
+#define CHECK_RESULT_SHUTDOWN(test, msg) \
+  if ((test)) {                          \
+    _errorFlag = true;                   \
+    printf("%s\n", msg);                 \
+    _errorMsg = msg;                     \
+    _crcword += 1;                       \
+    close();                             \
+    return;                              \
+  }
+
+// Variant for functions with a non-void return type: if `test` is true the
+// failure is recorded and printed, and the enclosing function returns 1.
+#define CHECK_RESULT_CL(test, msg) \
+  if ((test)) {                    \
+    _errorFlag = true;             \
+    printf("%s\n", msg);           \
+    _errorMsg = msg;               \
+    _crcword += 1;                 \
+    return 1;                      \
+  }
+
+// Common base for test implementations: declares the OCLTest entry points and
+// holds the error/CRC bookkeeping that the CHECK_* macros above write to
+// (_errorFlag, _errorMsg, _crcword).
+class BaseTestImp : public OCLTest {
+ public:
+  BaseTestImp();
+  virtual ~BaseTestImp();
+
+ public:
+  virtual unsigned int getThreadUsage(void);
+  virtual int getNumSubTests(void);
+
+  //! Abstract functions being defined here
+  virtual void open();
+  virtual void open(unsigned int test, const char* deviceName,
+                    unsigned int architecture);
+
+  // Forwards to open(test, deviceName, architecture) with the fixed device
+  // name "Tahiti"; note that platformIndex is passed in the `architecture`
+  // position and units/conversion/deviceId are not used here.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId, unsigned int platformIndex) {
+    return open(test, "Tahiti", platformIndex);
+  }
+
+  // Same forwarding as above with 0 as the third argument.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId) {
+    return open(test, "Tahiti", 0);
+  }
+
+  // Must be provided by every concrete test.
+  virtual void run(void) = 0;
+  virtual unsigned int close(void);
+
+  //! Functions to set class members
+  virtual void checkComplib(unsigned int test, const char* deviceName,
+                            unsigned int architecture);
+  virtual void setDeviceName(const char*);
+  virtual const char* getDeviceName();
+  virtual void setErrorMsg(const char* error);
+  virtual const char* getErrorMsg(void);
+  virtual bool hasErrorOccured(void);
+  virtual void clearError();
+  BaseTestImp* toBaseTestImp() { return this; }
+  // Returns NULL here; OCLTestImp overrides it to return itself.
+  virtual OCLTestImp* toOCLTestImp() { return NULL; }
+  virtual void useCPU() { _cpu = true; }
+  virtual void setIterationCount(int cnt);
+  virtual void setDeviceId(unsigned int deviceId);
+  virtual unsigned int getDeviceId();
+  virtual void setPlatformIndex(unsigned int platformIndex);
+  virtual unsigned int getPlatformIndex();
+  virtual float getPerfInfo();
+  virtual void clearPerfInfo();
+
+ protected:
+  unsigned int _numSubTests;
+  unsigned int _openTest;
+  unsigned int _useThreads;
+  int _iterationCnt;
+  float _perfInfo;
+  // Set by useCPU().
+  bool _cpu;
+
+  // Incremented by every CHECK_* macro when a failure is recorded.
+  unsigned int _crcword;
+  unsigned int _crctab[256];
+
+  // Failure state written by the CHECK_* macros.
+  bool _errorFlag;
+  std::string _errorMsg;
+
+  const char* _deviceName;
+  unsigned int _architecture;
+  unsigned int _deviceId;
+  unsigned int _platformIndex;
+  bool failed_ = false;
+  // NOTE(review): OCLTestImp declares members with these same names
+  // (error_, type_, deviceCount_, devices_, context_, program_, kernel_),
+  // which hide the ones below inside OCLTestImp and its subclasses.
+  cl_int error_;
+  cl_uint type_;
+  cl_uint deviceCount_;
+  cl_device_id* devices_;
+  cl_context context_;
+
+  cl_program program_;
+  cl_kernel kernel_;
+};
+
+// enum to keep track of different memory types
+// NOTE(review): "LOOCL" looks like a misspelling of "LOCAL"; confirm against
+// the users of this enum before renaming.
+enum MemType { LOOCL, REMOTE_CACHED, REMOTE_UNCACHED };
+
+// Bundles a cl_image_format with a printable name and a size value so that
+// tables of formats can be written with the DTYPE() macro. Instances convert
+// implicitly to each of the three stored pieces.
+class DataType {
+  cl_image_format f;
+  const char* str;
+  unsigned int size;
+
+ public:
+  // Value-initialise every member so a default-constructed entry yields a
+  // zeroed format, a null name and size 0 instead of indeterminate values.
+  DataType() : f(), str(NULL), size(0) {}
+
+  DataType(cl_image_format f, const char* str, unsigned int size)
+      : f(f), str(str), size(size) {}
+
+  // The conversions do not modify the object, so they are const and can also
+  // be used on const DataType tables.
+  operator const char*() const { return str; }
+
+  operator unsigned int() const { return size; }
+  operator cl_image_format() const { return f; }
+};
+
+// Useful for initializing an array of data types for a test: builds a DataType
+// from format `x`, the spelling of `x` as its name, and `y` cast to unsigned.
+#define DTYPE(x, y) DataType(x, #x, (unsigned int)y)
+
+#endif
@@ -0,0 +1,83 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLTestImp_H_
+#define _OCLTestImp_H_
+
+#include <string>
+#include <vector>
+
+#include "BaseTestImp.h"
+#include "CL/cl.h"
+#include "OCL/Thread.h"
+#include "OCLTest.h"
+#include "OCLWrapper.h"
+
+// Base class for the OpenCL tests: extends BaseTestImp with the OCLWrapper
+// used for API calls, per-device command queues, a list of cl_mem buffers,
+// random-number helpers and CRC accumulation.
+class OCLTestImp : public BaseTestImp {
+ public:
+  OCLTestImp();
+  virtual ~OCLTestImp();
+
+ public:
+  //! Abstract functions being defined here
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId, unsigned int platformIndex);
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void) = 0;
+  virtual unsigned int close(void);
+  //! Functions to set class members
+
+ public:
+  void useCPU();
+  int genIntRand(int a, int b);
+  int genBitRand(int n);
+  void accumulateCRC(const void* buffer, int len);
+  void setOCLWrapper(OCLWrapper* wrapper);
+  OCLTestImp* toOCLTestImp() { return this; }
+
+  // Locks shared by all instances.
+  // NOTE(review): presumably they serialize device opening and program
+  // compilation across test threads - confirm in the implementation file.
+  static OCLutil::Lock openDeviceLock;
+  static OCLutil::Lock compileLock;
+
+ protected:
+  // Read-only view of buffers_.
+  const std::vector<cl_mem>& buffers() const { return buffers_; }
+
+  // API wrapper installed through setOCLWrapper().
+  OCLWrapper* _wrapper;
+
+  int _seed;
+
+  // Common data of any CL program
+  // NOTE(review): error_, type_, deviceCount_, devices_, context_, program_
+  // and kernel_ are also declared in BaseTestImp; the declarations here hide
+  // the base-class members.
+  cl_int error_;
+  cl_uint type_;
+  cl_uint deviceCount_;
+  cl_device_id* devices_;
+  cl_platform_id platform_;
+  std::vector<cl_command_queue> cmdQueues_;
+  cl_context context_;
+
+  cl_program program_;
+  cl_kernel kernel_;
+  std::vector<cl_mem> buffers_;
+};
+
+// Useful for initializing an array of data types for a test. This repeats,
+// token for token, the DTYPE definition in BaseTestImp.h, so the
+// redefinition is benign.
+#define DTYPE(x, y) DataType(x, #x, (unsigned int)y)
+
+#endif
@@ -0,0 +1,86 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef __Dictionary_h__
+#define __Dictionary_h__
+
+//
+// Testing module (plugin) interface forward declarations
+//
+// OCL_DLLEXPORT / OCL_CALLCONV decorate every exported entry point below.
+//
+#ifdef ATI_OS_WIN
+#define OCL_DLLEXPORT __declspec(dllexport)
+#define OCL_CALLCONV __cdecl
+#endif
+#ifdef ATI_OS_LINUX
+#define OCL_DLLEXPORT
+#define OCL_CALLCONV
+#endif
+// Fall back to empty decorations when neither platform macro is defined, so
+// the declarations below remain well-formed instead of failing to parse.
+#ifndef OCL_DLLEXPORT
+#define OCL_DLLEXPORT
+#endif
+#ifndef OCL_CALLCONV
+#define OCL_CALLCONV
+#endif
+
+// Only pointers to OCLTest appear below, so a forward declaration suffices.
+class OCLTest;
+
+//
+//  OCLTestList_TestCount - retrieve the number of tests in the testing module
+//
+extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV OCLTestList_TestCount(void);
+
+//
+//  OCLTestList_TestLibVersion - retrieve the version of test lib in the testing
+//  module
+//
+extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV
+OCLTestList_TestLibVersion(void);
+
+//
+//  OCLTestList_TestLibName - retrieve the name of test library
+//
+extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV OCLTestList_TestLibName(void);
+
+//
+//  OCLTestList_TestName - retrieve the name of the indexed test in the module
+//
+extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV
+OCLTestList_TestName(unsigned int testNum);
+
+//
+//  OCLTestList_CreateTest - create a test by index; the returned object is
+//  handed back to OCLTestList_DestroyTest when no longer needed
+//
+extern "C" OCL_DLLEXPORT OCLTest* OCL_CALLCONV
+OCLTestList_CreateTest(unsigned int testNum);
+
+//
+//  OCLTestList_DestroyTest - destroy a test object
+//
+extern "C" OCL_DLLEXPORT void OCL_CALLCONV
+OCLTestList_DestroyTest(OCLTest* test);
+
+//
+//  internal global data that is populated in each dll
+//
+//  TestEntry pairs a test's name with a factory that returns a newly created
+//  test object as an untyped pointer.
+//
+typedef struct _TestEntry {
+  const char* name;
+  void* (*create)(void);
+} TestEntry;
+
+//  Defined once per test module (see the module's test-list source file).
+extern TestEntry TestList[];
+extern unsigned int TestListCount;
+extern unsigned int TestLibVersion;
+extern const char* TestLibName;
+#endif
@@ -0,0 +1,32 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_INCLUDES_H
+#define _OCL_INCLUDES_H
+
+#ifdef ATI_OS_WIN
+#define POINTER_64 __ptr64
+#include <windows.h>
+#include "d3d9.h"
+#endif
+
+#include "CL/cl.h"
+
+#endif  //_OCL_INCLUDES_H
@@ -0,0 +1,211 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerf3DImageWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vector>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: use sprintf_s when WIN_OS is defined, snprintf
+// otherwise.
+// NOTE(review): other headers in this code base test ATI_OS_WIN / _MSC_VER;
+// confirm that WIN_OS is actually defined by the Windows build.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Stringifies its arguments so the kernel can be written inline as tokens.
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+// Edge lengths, in pixels, of the cubic 3D images exercised by the sub-tests.
+#define NUM_SIZES 4
+static const unsigned int Sizes[NUM_SIZES] = {64, 128, 256, 512};
+
+// Image formats under test, with a printable description for each.
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"};
+// Bytes per pixel for each entry of formats[]: four 8-bit channels for
+// CL_RGBA / CL_UNSIGNED_INT8. Spelled out explicitly because
+// sizeof(CL_UNSIGNED_INT8) is the size of an integer constant and matches the
+// pixel size only by coincidence.
+static const unsigned int formatSize[NUM_FORMATS] = {4 * sizeof(cl_uchar)};
+
+// OpenCL C source of the benchmark kernel: every work-item writes one texel
+// of the write-only 3D image at its (x, y, z) global id.
+// NOTE(review): "(1, 1, 1, 1)" has no vector type in front of it, so it reads
+// as a comma expression rather than a uint4 literal - confirm that
+// "(uint4)(1, 1, 1, 1)" was not intended.
+const static char *strKernel = {KERNEL_CODE(
+  \n __kernel void image_kernel(write_only image3d_t input) {
+  size_t x = get_global_id(0);
+  size_t y = get_global_id(1);
+  size_t z = get_global_id(2);
+
+  int4 coords = (int4)(x, y, z, 0);
+  write_imageui(input, coords, (1, 1, 1, 1));
+}
+  \n)};
+
+// One sub-test per (image size, image format) combination.
+// Every member gets a defined value here (in declaration order) so that
+// close() never reads indeterminate state when open() returns early.
+OCLPerf3DImageWriteSpeed::OCLPerf3DImageWriteSpeed()
+    : cmd_queue_(0),
+      imageBuffer_(0),
+      bufSize_(0),
+      bufnum_(0),
+      memptr(NULL),
+      memSize_(0),
+      testId_(0),
+      skip_(false) {
+  _numSubTests = NUM_SIZES * NUM_FORMATS;
+}
+
+// Nothing to release here; the image is released in close().
+OCLPerf3DImageWriteSpeed::~OCLPerf3DImageWriteSpeed() {}
+
+// Callback taking (errinfo, private_info, cb, user_data); it does nothing
+// with its arguments.
+static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
+                                        const void * /*private_info*/,
+                                        size_t /*cb*/, void * /*user_data*/) {
+}
+
+// Prepares sub-test `test`: opens the base test, checks that the device
+// advertises cl_khr_3d_image_writes (otherwise the sub-test is marked as
+// skipped), builds the kernel, creates the write-only 3D image for the
+// selected size/format and binds it as kernel argument 0.
+//
+// `test` selects the image edge length (test % NUM_SIZES) and the format
+// ((test / NUM_SIZES) % NUM_FORMATS). units/conversion/deviceId are forwarded
+// to OCLTestImp::open(). Failures are reported through CHECK_RESULT, which
+// returns early.
+void OCLPerf3DImageWriteSpeed::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  error_ = CL_SUCCESS;
+  testId_ = test;
+
+  // Reset this class's own state before anything can return early, so that
+  // close() never acts on values left over from a previous sub-test.
+  cmd_queue_ = 0;
+  imageBuffer_ = 0;
+  skip_ = false;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+
+  // The extension string has no fixed upper bound, so ask for its size first
+  // instead of reading into a fixed-size buffer.
+  size_t extSize = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     0, NULL, &extSize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // One extra zeroed byte guarantees termination for strstr().
+  std::vector<char> extensions(extSize + 1, '\0');
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     extensions.size(), &extensions[0], NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  if (!strstr(&extensions[0], "cl_khr_3d_image_writes")) {
+    skip_ = true;
+    testDescString = "3D Write not supported. Test Skipped.";
+    return;
+  }
+
+  bufSize_ = Sizes[test % NUM_SIZES];
+  bufnum_ = (test / NUM_SIZES) % NUM_FORMATS;
+  memSize_ = bufSize_ * bufSize_ * bufSize_ * formatSize[bufnum_];
+
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Size the log buffer from the runtime so long logs are not lost and the
+    // printed text is always terminated.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    std::vector<char> programLog(logSize + 1, '\0');
+    if (logSize > 0) {
+      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                      CL_PROGRAM_BUILD_LOG, logSize,
+                                      &programLog[0], NULL);
+    }
+    printf("\n%s\n", &programLog[0]);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  imageBuffer_ = _wrapper->clCreateImage3D(
+      context_, CL_MEM_WRITE_ONLY, &formats[bufnum_], bufSize_, bufSize_,
+      bufSize_, 0, 0, NULL, &error_);
+  CHECK_RESULT(imageBuffer_ == 0, "clCreateImage(imageBuffer_) failed");
+
+  // set kernel arguments
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+// Executes the sub-test prepared by open():
+//   1. runs the kernel once as a warm-up,
+//   2. reads the image back and checks that every byte equals 1,
+//   3. times numIter kernel launches (each followed by clFinish) and stores
+//      the throughput in GB/s in _perfInfo, with a description in
+//      testDescString.
+// Does nothing when open() marked the sub-test as skipped.
+void OCLPerf3DImageWriteSpeed::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+  unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS;
+
+  size_t gws[3] = {bufSize_, bufSize_, bufSize_};
+  size_t lws[3] = {8, 8, 4};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws,
+                                            lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  // checkData
+  // The read-back buffer is owned by a vector so it is released on every exit
+  // path, including the early returns hidden inside CHECK_RESULT.
+  std::vector<char> hostImage(memSize_);
+  char *bufptr = &hostImage[0];
+
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, bufSize_};
+  size_t image_row_pitch = bufSize_ * formatSize[bufnum_];
+  size_t image_slice_pitch = image_row_pitch * bufSize_;
+  error_ = clEnqueueReadImage(cmd_queue_, imageBuffer_, true, origin, region,
+                              image_row_pitch, image_slice_pitch, bufptr, 0,
+                              NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+  // memSize_ is the byte size of the whole image (edge^3 * bytes per pixel).
+  for (size_t i = 0; i < hostImage.size(); ++i) {
+    if (bufptr[i] != 1) {
+      printf("(%4dx%4dx%4d) fmt:%s(%1u) checkData() fail, image_ptr[%u] = %d\n",
+             bufSize_, bufSize_, bufSize_, textFormats[fmt_num],
+             formatSize[bufnum_], (unsigned int)i, (int)bufptr[i]);
+      // The condition must be true for the failure to be recorded at all.
+      CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+      char buf[256];
+      SNPRINTF(buf, sizeof(buf),
+               " (%4dx%4dx%4d) fmt:%s(%1d) checkData() FAILED! ", bufSize_,
+               bufSize_, bufSize_, textFormats[fmt_num], formatSize[bufnum_]);
+      testDescString = buf;
+      return;
+    }
+  }
+
+  // test begins
+  unsigned int numIter = 5;
+
+  timer.Reset();
+  timer.Start();
+
+  for (unsigned int i = 0; i < numIter; ++i) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws,
+                                              lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  // write_image speed in GB/s
+  double perf = ((double)memSize_ * numIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%3dx%3dx%3d) fmt:%s(%1u) i: %2d (GB/s) ",
+           bufSize_, bufSize_, bufSize_, textFormats[fmt_num],
+           formatSize[bufnum_], numIter);
+  testDescString = buf;
+}
+
+// Releases the 3D image created by open() (unless the sub-test was skipped)
+// and then defers to OCLTestImp::close() for the return value.
+unsigned int OCLPerf3DImageWriteSpeed::close(void) {
+  if (!skip_) {
+    if (imageBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(imageBuffer_);
+      // Forget the handle so a later close() cannot release it a second time.
+      imageBuffer_ = 0;
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(imageBuffer_) failed");
+    }
+  }
+  return OCLTestImp::close();
+}
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_3DImageWriteSpeed_H_
+#define _OCL_3DImageWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Declaration of the 3D image write-speed performance test. It derives from
+// OCLTestImp and declares the virtual open/run/close entry points
+// (presumably overriding the base-class versions -- confirm in OCLTestImp.h).
+class OCLPerf3DImageWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerf3DImageWriteSpeed();
+  virtual ~OCLPerf3DImageWriteSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  // Releases imageBuffer_ unless skip_ is set, then calls OCLTestImp::close().
+  virtual unsigned int close(void);
+
+  // Queue that run() enqueues the kernel on and waits on with clFinish.
+  cl_command_queue cmd_queue_;
+  // Image memory object; released with clReleaseMemObject in close().
+  cl_mem imageBuffer_;
+
+  // Edge length used for all three image dimensions in run()'s validation
+  // loop and in the reported "(NxNxN)" description.
+  unsigned int bufSize_;
+  // Index into the formatSize table when run() builds its description.
+  unsigned int bufnum_;
+  char* memptr;
+  // Byte count that run() multiplies by the iteration count for the GB/s
+  // figure.
+  unsigned int memSize_;
+  unsigned int testId_;
+
+  // When true, close() leaves imageBuffer_ alone.
+  bool skip_;
+};
+
+#endif  // _OCL_3DImageWriteSpeed_H_
@@ -0,0 +1,451 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfAES256.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// OpenCL C source for the "CryptThread" kernel that open() builds for
+// subtest 0. Each work-item transforms one uint4 of pInput into pOutput,
+// reading the lookup tables and the round keys straight from __global
+// memory. The string is handed to clCreateProgramWithSource unchanged.
+static const char *aes256_kernel =
+    "// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT "
+    "REDISTRIBUTE!!\n"
+    "inline uint Load(__global uint* pData, const uint iX, const uint iY)\n"
+    "{\n"
+    "   return pData[iX | (iY << 8)];\n"
+    "}\n"
+    "\n"
+    "\n"
+    "inline uint4 Load4(__global uint* pData, const uint4 uX, const uint iY)\n"
+    "{\n"
+    "   uint  uExtent = iY << 8;\n"
+    "   uint4 uNdx = uX + uExtent;\n"
+    "   \n"
+    "   return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], "
+    "pData[uNdx.w]);\n"
+    "}\n"
+    "\n"
+    "\n"
+    "__kernel \n"
+    "__attribute__((vec_type_hint(uint4))) \n"
+    "void CryptThread(__global uint4* pInput, __global uint4* pOutput,\n"
+    "                       __global uint* pTables,\n"
+    "                       __global uint4* pKey, const uint iRounds)\n"
+    "{\n"
+    "   const uint iNdx = get_global_id(0);\n"
+    "   \n"
+    "   uint4 state, istate, tstate;\n"
+    "   state = pInput[iNdx] ^ pKey[iRounds];\n"
+    "   \n"
+    "   for (uint i = iRounds-1; i; i--)\n"
+    "   {\n"
+    "       istate = state & 0xFF;\n"
+    "       tstate = Load4(pTables, istate.xyzw, 0);\n"
+    "\n"
+    "       istate = (state >> 8) & 0xFF;\n"
+    "       tstate^= Load4(pTables, istate.wxyz, 1);\n"
+    "\n"
+    "       istate = (state >> 16) & 0xFF;\n"
+    "       tstate^= Load4(pTables, istate.zwxy, 2);\n"
+    "\n"
+    "       istate = state >> 24;\n"
+    "       tstate^= Load4(pTables, istate.yzwx, 3);\n"
+    "\n"
+    "       state = tstate ^ pKey[i];\n"
+    "   }\n"
+    "\n"
+    "   istate = state & 0xFF;\n"
+    "   tstate = Load4(pTables, istate.xyzw, 4);\n"
+    "\n"
+    "   istate = (state >> 8) & 0xFF;\n"
+    "   tstate |= Load4(pTables, istate.wxyz, 4) << 8;\n"
+    "\n"
+    "   istate = (state >> 16) & 0xFF;\n"
+    "   tstate |= Load4(pTables, istate.zwxy, 4) << 16;\n"
+    "\n"
+    "   istate = state >> 24;\n"
+    "   tstate |= Load4(pTables, istate.yzwx, 4) << 24;\n"
+    "\n"
+    "   pOutput[iNdx] = tstate ^ pKey[0];\n"
+    "}\n";
+
+// OpenCL C source for the alternative "CryptThread" kernel that open()
+// builds for every subtest other than 0. It takes the tables and the key
+// through __constant pointers and, because the source itself defines
+// _IS_GPU_, first copies the AES_CONST_SIZE table words into a __local array
+// (followed by a local-memory barrier) before the rounds run. The string is
+// handed to clCreateProgramWithSource unchanged.
+static const char *aes256_kernel2 =
+    "// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT "
+    "REDISTRIBUTE!!\n"
+    "#define AES_BLOCK_SIZE      16\n"
+    "#define AES_TABLE_SIZE      256\n"
+    "\n"
+    "#define AES_TABLE_MAX       5\n"
+    "#define AES_CONST_SIZE      (AES_TABLE_SIZE*AES_TABLE_MAX)\n"
+    "\n"
+    "#define AES_ROUND_128       10\n"
+    "#define AES_ROUND_192       12\n"
+    "#define AES_ROUND_256       14\n"
+    "#define AES_ROUNDKEY_MAX    (AES_BLOCK_SIZE/4*(AES_ROUND_256+1))\n"
+    "#define _IS_GPU_\n"
+    "\n"
+    "\n"
+    "inline uint Load(\n"
+    "#ifdef _IS_GPU_\n"
+    "    __local uint* pData,\n"
+    "#else\n"
+    "    __constant uint* pData,\n"
+    "#endif\n"
+    "    const uint iX, const uint iY)\n"
+    "{\n"
+    "    const uint uNdx = iX + iY*AES_TABLE_SIZE;\n"
+    "    return pData[uNdx];\n"
+    "}\n"
+    "\n"
+    "\n"
+    "inline uint4 Load4(\n"
+    "#ifdef _IS_GPU_\n"
+    "    __local uint* pData,\n"
+    "#else\n"
+    "    __constant uint* pData,\n"
+    "#endif\n"
+    "    const uint4 uX, const uint iY)\n"
+    "{\n"
+    "    const uint  uExtent = iY*AES_TABLE_SIZE;\n"
+    "    const uint4 uNdx = uX + uExtent;\n"
+    "    \n"
+    "    return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], "
+    "pData[uNdx.w]);\n"
+    "}\n"
+    "\n"
+    "\n"
+    "__kernel \n"
+    "__attribute__((vec_type_hint(uint4)))\n"
+    "#ifdef KERNEL_MAX_THREADS\n"
+    "__attribute__((work_group_size_hint(KERNEL_MAX_THREADS, 1, 1)))\n"
+    "#endif\n"
+    "void CryptThread(__global const uint4* pInput, __global uint4* pOutput,\n"
+    "                        __constant uint* pTables,\n"
+    "                        __constant uint4* pKey, const uint iRounds)\n"
+    "{\n"
+    "    const size_t iNdx = get_global_id(0);\n"
+    "\n"
+    "#ifdef _IS_GPU_\n"
+    "    #define Load4T(x, y)    Load4(ulTables, x, y)\n"
+    "\n"
+    "    __local uint  ulTables[AES_CONST_SIZE];\n"
+    "\n"
+    "    const uint iLdx = get_local_id(0);\n"
+    "    if (iLdx < AES_TABLE_SIZE) {\n"
+    "        const uint iGrps = get_local_size(0);\n"
+    "        const uint iLSize = min(iGrps, (uint)AES_TABLE_SIZE);\n"
+    "        const uint iBpL = AES_CONST_SIZE/iLSize;\n"
+    "\n"
+    "        const uint iStart = iLdx*iBpL;\n"
+    "        const uint iEnd   = iStart + iBpL;\n"
+    "\n"
+    "        for (uint i=iStart; i<iEnd; i++) {\n"
+    "            ulTables[i] = pTables[i];\n"
+    "        }\n"
+    "    }\n"
+    "\n"
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "#else\n"
+    "    #define Load4T(x, y)    Load4(pTables, x, y)\n"
+    "#endif\n"
+    "    \n"
+    "    uint4 state, istate, tstate;\n"
+    "    state = pInput[iNdx] ^ pKey[0];\n"
+    "    \n"
+    "    for (uint i = 1; i < iRounds; i++)\n"
+    "    {\n"
+    "        istate = state & 0xFF;\n"
+    "        tstate = Load4T(istate.xyzw, 0);\n"
+    "\n"
+    "        istate = (state >> 8) & 0xFF;\n"
+    "        tstate^= Load4T(istate.yzwx, 1);\n"
+    "\n"
+    "        istate = (state >> 16) & 0xFF;\n"
+    "        tstate^= Load4T(istate.zwxy, 2);\n"
+    "\n"
+    "        istate = state >> 24;\n"
+    "        tstate^= Load4T(istate.wxyz, 3);\n"
+    "\n"
+    "        state = tstate ^ pKey[i];\n"
+    "    }\n"
+    "\n"
+    "    istate = state & 0xFF;\n"
+    "    tstate = Load4T(istate.xyzw, 4);\n"
+    "\n"
+    "    istate = (state >> 8) & 0xFF;\n"
+    "    tstate |= Load4T(istate.yzwx, 4) << 8;\n"
+    "\n"
+    "    istate = (state >> 16) & 0xFF;\n"
+    "    tstate |= Load4T(istate.zwxy, 4) << 16;\n"
+    "\n"
+    "    istate = state >> 24;\n"
+    "    tstate |= Load4T(istate.wxyz, 4) << 24;\n"
+    "\n"
+    "    pOutput[iNdx] = tstate ^ pKey[iRounds];\n"
+    "}\n";
+
+// Registers two subtests: open() builds aes256_kernel for subtest 0 and
+// aes256_kernel2 for any other index.
+OCLPerfAES256::OCLPerfAES256() {
+  _numSubTests = 2;
+}
+
+// Intentionally empty: the OpenCL objects are released in close().
+OCLPerfAES256::~OCLPerfAES256() {
+}
+
+// Fills every 32-bit word of `buffer` (bufSize_ bytes) with `val` through a
+// blocking write mapping on cmd_queue_.
+//
+// If the buffer cannot be mapped the function reports the failure through
+// CHECK_RESULT and does not touch the (null) mapping. An unmap failure is
+// reported as well, but the queue is still drained afterwards.
+void OCLPerfAES256::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, CL_TRUE, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // A failed map returns NULL; writing through it would crash the test.
+  CHECK_RESULT((error_ != CL_SUCCESS) || (data == NULL),
+               "clEnqueueMapBuffer failed");
+
+  const size_t numWords = bufSize_ / sizeof(unsigned int);
+  for (size_t i = 0; i < numWords; i++) {
+    data[i] = val;
+  }
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                         "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_);
+}
+
+// Maps `buffer` for reading and unmaps it again. No comparison is performed:
+// run() notes that the expected output values are unknown, so this is a
+// placeholder for a future validation pass.
+//
+// A failed map is reported through CHECK_RESULT instead of handing a null
+// pointer to clEnqueueUnmapMemObject.
+void OCLPerfAES256::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, CL_TRUE, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS) || (data == NULL),
+               "clEnqueueMapBuffer failed");
+
+  // Validation of the bufSize_ / sizeof(unsigned int) mapped words would go
+  // here once reference values exist.
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                         "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares subtest `test` on device index `deviceId`.
+//
+// Steps: pick the platform at _platformIndex, pick the requested device,
+// create the context and command queue, allocate the input/output/table/key
+// buffers, build the kernel source selected by the subtest index, bind the
+// kernel arguments and fill the input and output buffers with a pattern.
+//
+// `units` is not used; `conversion` is set to 1.0.
+//
+// Any failure is reported through CHECK_RESULT, which leaves the function.
+// Handles created before the failure remain stored in the members so that
+// close() can release them.
+void OCLPerfAES256::open(unsigned int test, char *units, double &conversion,
+                         unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  tableBuffer_ = 0;
+  keyBuffer_ = 0;
+  blockSize_ = 1024;
+  maxIterations = 50;
+  numCUs = 0;
+
+  bufSize_ = 5592320 * sizeof(cl_uint4);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and free the array before any early return so the
+    // array cannot leak. An out-of-range index leaves `platform` null.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[], so delete[] is required
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  CHECK_RESULT((error_ != CL_SUCCESS) || (num_devices == 0),
+               "clGetDeviceIDs failed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  // Only the selected handle is needed from here on; free the list before
+  // the checks below can return.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Increase iterations for devices with many CUs.
+  // CL_DEVICE_MAX_COMPUTE_UNITS is a cl_uint; reading it straight into the
+  // size_t member would leave the upper bytes indeterminate on 64-bit hosts.
+  cl_uint computeUnits = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(computeUnits), &computeUnits,
+                                     &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  numCUs = computeUnits;
+
+  maxIterations *= (unsigned int)(1 + 10 * numCUs / 20);
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_,
+                                       NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_,
+                                        NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // 5120 bytes = 5 tables of 256 uints, the range the kernels index.
+  tableBuffer_ =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 5120, NULL, &error_);
+  CHECK_RESULT(tableBuffer_ == 0, "clCreateBuffer(tableBuffer) failed");
+
+  // 240 bytes = 15 uint4 round keys (indices 0..rounds with rounds == 14).
+  keyBuffer_ =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 240, NULL, &error_);
+  CHECK_RESULT(keyBuffer_ == 0, "clCreateBuffer(keyBuffer) failed");
+
+  if (_openTest == 0) {
+    program_ = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&aes256_kernel, NULL, &error_);
+    CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+    testDescString += "orig";
+  } else {
+    program_ = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&aes256_kernel2, NULL, &error_);
+    CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+    testDescString += " new";
+  }
+
+  const char *buildOps = NULL;
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    // Keep the buffer printable even if the log query itself fails.
+    log[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    CHECK_RESULT(1, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  cl_uint rounds = 14;
+
+  // Check every argument binding; previously each status was overwritten by
+  // the next call and never inspected.
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(0) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(1) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem),
+                                    (void *)&tableBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(2) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&keyBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(3) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&rounds);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(4) failed");
+
+  setData(inBuffer_, 0xdeadbeef);
+  setData(outBuffer_, 0xdeadbeef);
+}
+
+// Enqueues the kernel maxIterations times over bufSize_ / sizeof(cl_uint4)
+// work-items (work-group size 64), waits for the queue to drain, and stores
+// the achieved throughput in GB/s in _perfInfo.
+//
+// The loop stops at the first failed enqueue so the failure is reported
+// rather than being overwritten by a later, successful call.
+void OCLPerfAES256::run(void) {
+  int global = bufSize_ / sizeof(cl_uint4);
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < maxIterations; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              global_work_size,
+                                              local_work_size, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      break;  // keep the first failure for the check below
+    }
+  }
+
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // No idea what data should be in here
+  // checkData(outBuffer_);
+  // Compute GB/s
+  double perf =
+      ((double)bufSize_ * (double)maxIterations * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+}
+
+// Drains the command queue and releases every OpenCL object that open()
+// created, in the order buffers, kernel, program, queue, context. Handles
+// that are still null (open() left early) are skipped. Each release failure
+// is reported without aborting the remaining cleanup. Returns _crcword.
+unsigned int OCLPerfAES256::close(void) {
+  // open() can fail before the queue exists; do not hand a null queue to
+  // clFinish in that case.
+  if (cmd_queue_) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (tableBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(tableBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(tableBuffer_) failed");
+  }
+  if (keyBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(keyBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(keyBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_AES256_H_
+#define _OCL_AES256_H_
+
+#include "OCLTestImp.h"
+
+// Declaration of the AES-256 kernel throughput test. open() builds one of
+// two "CryptThread" kernel sources depending on the subtest index, run()
+// times repeated launches, and close() releases the OpenCL objects below.
+class OCLPerfAES256 : public OCLTestImp {
+ public:
+  OCLPerfAES256();
+  virtual ~OCLPerfAES256();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Not referenced by the OCLPerfAES256.cpp implementation.
+  std::string shader_;
+  // Fills every 32-bit word of `buffer` with `data` through a mapped pointer.
+  void setData(cl_mem buffer, unsigned int data);
+  // Maps and unmaps `buffer`; currently performs no comparison.
+  void checkData(cl_mem buffer);
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem inBuffer_;     // kernel argument 0
+  cl_mem outBuffer_;    // kernel argument 1
+  cl_mem tableBuffer_;  // kernel argument 2, created with 5120 bytes
+  cl_mem keyBuffer_;    // kernel argument 3, created with 240 bytes
+  // Status of the most recent OpenCL call.
+  cl_int error_;
+
+  // Not referenced by the OCLPerfAES256.cpp implementation.
+  unsigned int width_;
+  // Size in bytes of inBuffer_ and outBuffer_.
+  unsigned int bufSize_;
+  // Set to 1024 in open(); not otherwise used by the implementation.
+  unsigned int blockSize_;
+  // Number of kernel launches timed by run(); open() starts at 50 and scales
+  // it by the device's compute-unit count.
+  unsigned int maxIterations;
+  // Filled from CL_DEVICE_MAX_COMPUTE_UNITS in open().
+  size_t numCUs;
+};
+
+#endif  // _OCL_AES256_H_
@@ -0,0 +1,817 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfAtomicSpeed.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "CL/cl.h"
+#include "OCLPerfAtomicSpeedKernels.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Define the test suite tests.
+// Subtest table: one entry per subtest, indexed by the subtest number that
+// open() receives. open() reads the `atomicType` and `inputScale` members of
+// the selected entry (the initializers below presumably follow that member
+// order -- confirm against testOCLPerfAtomicSpeedStruct). The constructor
+// derives _numSubTests from the size of this array, so adding or removing an
+// entry changes the number of subtests.
+testOCLPerfAtomicSpeedStruct testOCLPerfAtomicSpeedList[] = {
+    {LocalHistogram, 1},
+    {LocalHistogram, 2},
+    {LocalHistogram, 4},
+    {GlobalHistogram, 1},
+    {GlobalHistogram, 2},
+    {GlobalHistogram, 4},
+    {Global4Histogram, 1},
+    {Global4Histogram, 2},
+    {Global4Histogram, 4},
+    {LocalReductionNoAtomics, 1},
+    {LocalReductionNoAtomics, 2},
+    {LocalReductionNoAtomics, 4},
+    {LocalReductionAtomics, 1},
+    {LocalReductionAtomics, 2},
+    {LocalReductionAtomics, 4},
+    {Local4ReductionNoAtomics, 1},
+    {Local4ReductionNoAtomics, 2},
+    {Local4ReductionNoAtomics, 4},
+    // The Local4ReductionAtomics variants are disabled.
+    /*    {Local4ReductionAtomics, 1},
+        {Local4ReductionAtomics, 2},
+        {Local4ReductionAtomics, 4},*/
+    {GlobalWGReduction, 1},
+    {GlobalWGReduction, 2},
+    {GlobalWGReduction, 4},
+    {GlobalAllToZeroReduction, 1},
+    {GlobalAllToZeroReduction, 2},
+    {GlobalAllToZeroReduction, 4},
+    {Global4WGReduction, 1},
+    {Global4WGReduction, 2},
+    {Global4WGReduction, 4},
+    {Global4AllToZeroReduction, 1},
+    {Global4AllToZeroReduction, 2},
+    {Global4AllToZeroReduction, 4},
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// OCLPerfAtomicSpeed implementation.
+///////////////////////////////////////////////////////////////////////////////
+// Puts the test object into its idle state: one subtest per entry of
+// testOCLPerfAtomicSpeedList, default loop/work-group settings, and no host
+// or device resources attached yet.
+OCLPerfAtomicSpeed::OCLPerfAtomicSpeed() {
+  // Subtest count follows the table size.
+  _numSubTests = sizeof(testOCLPerfAtomicSpeedList) /
+                 sizeof(testOCLPerfAtomicSpeedList[0]);
+
+  // Default run parameters.
+  _numLoops = 10;
+  _workgroupSize = 256;
+  _nCurrentInputScale = 1;
+  _maxMemoryAllocationSize = 0;
+
+  // Capability / size flags are re-evaluated in open().
+  _atomicsSupported = false;
+  _dataSizeTooBig = false;
+
+  // Nothing allocated so far.
+  _input = NULL;
+  _output = NULL;
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+  _programs.clear();
+  _kernels.clear();
+}
+
+// Intentionally empty; the destructor itself releases nothing.
+OCLPerfAtomicSpeed::~OCLPerfAtomicSpeed() {
+}
+
+// Prepares subtest `test` on device index `deviceId`.
+//
+// Steps: look up the subtest's atomic type and input scale, build the host
+// reference data (setupHistogram / calculateHostBin), select the platform at
+// _platformIndex if it exposes devices of type_, select the device, create
+// the context, and -- provided the input fits within the device allocation
+// limit and an int32 atomics extension is advertised -- create the command
+// queue, the input buffer, the kernels and the output buffer.
+//
+// `units` is not used; `conversion` is set to 1.0.
+//
+// The function returns quietly with _dataSizeTooBig set when the data set is
+// too large, and with _atomicsSupported false when atomics are missing.
+// Other failures are reported through CHECK_RESULT, which leaves the
+// function.
+void OCLPerfAtomicSpeed::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_int status = CL_SUCCESS;
+
+  device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  _cpuReductionSum = 0;
+  _nCurrentInputScale = testOCLPerfAtomicSpeedList[_openTest].inputScale;
+  AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
+
+  // Setup stuff...
+  setupHistogram();
+  calculateHostBin();
+
+  context_ = 0;
+  cmd_queue_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and free the array before any early return so the
+    // array cannot leak. An out-of-range index leaves `candidate` null.
+    cl_platform_id candidate = NULL;
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      candidate = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[], so delete[] is required
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    if (candidate != NULL) {
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(candidate, type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so the status is deliberately not checked;
+      // the device count decides whether the platform is usable.
+      if (num_devices > 0) {
+        platform = candidate;
+      }
+    }
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  // Only the selected handle is needed from here on; free the list before
+  // the checks below can return.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, NULL, NULL, &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Largest single allocation the device supports.
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                     sizeof(cl_ulong),
+                                     &_maxMemoryAllocationSize, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS,
+               "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");
+
+  // Check that the test size is not too big for the current GPU. The 10 MB
+  // headroom is added to the input size instead of being subtracted from the
+  // limit, so a limit below 10 MB cannot wrap around to a huge value.
+  _dataSizeTooBig = false;
+  const cl_ulong tenMB = 1024 * 10240;
+  if ((cl_ulong)_inputNBytes + tenMB >= _maxMemoryAllocationSize) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  char *p = strstr(charbuf, "cl_khr_global_int32_base_atomics");
+  char *p2 = strstr(charbuf, "cl_khr_local_int32_base_atomics");
+
+  _atomicsSupported = false;
+  if (p || p2) _atomicsSupported = true;
+
+  // Verify atomics are supported.
+  if (!_atomicsSupported) return;
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Create buffers... (through _wrapper, like every other OpenCL call here)
+  _inputBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY,
+                                          _inputNBytes, 0, &status);
+  CHECK_RESULT(status != CL_SUCCESS, "clCreateBuffer failed. (inputBuffer)");
+
+  // Create the programs/kernels for the current test type.
+  CreateKernels(atomicType);
+
+  _nThreadsPerGroup = _workgroupSize;
+  _nGroups = _nThreads / _nThreadsPerGroup;
+  _outputNBytes = _nGroups * NBINS * sizeof(cl_uint);
+  if (IsReduction(atomicType)) _outputNBytes = _inputNBytes;
+
+  _output = (cl_uint *)malloc(_outputNBytes);
+  if (0 == _output) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Create output Buffer
+  _outputBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           _outputNBytes, 0, &status);
+  CHECK_RESULT(status != CL_SUCCESS, "clCreateBuffer failed. (outputBuffer)");
+}
+
+// Create the programs/kernels for the current test type.
+void OCLPerfAtomicSpeed::CreateKernels(const AtomicType atomicType) {
+  char log[16384];
+  cl_kernel kernel_;
+  cl_program program_;
+  char buildOptions[1000];
+  cl_int status = CL_SUCCESS;
+
+  SNPRINTF(buildOptions, sizeof(buildOptions),
+           "-D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS, BITS_PER_PIX,
+           NBANKS);
+
+  // Create the programs.
+  switch (atomicType) {
+    case LocalHistogram:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_histogram, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_reduce, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case LocalReductionNoAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case Local4ReductionNoAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_vec4_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case LocalReductionAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case Local4ReductionAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_vec4_atomics_reduction, NULL,
+          &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_histogram, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
+  }
+  // Build the programs.
+  for (size_t i = 0; i < _programs.size(); i++) {
+    error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
+                                      NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      status = _wrapper->clGetProgramBuildInfo(_programs[i], device,
+                                               CL_PROGRAM_BUILD_LOG,
+                                               16384 * sizeof(char), log, NULL);
+      printf("Build error -> %s\n", log);
+
+      CHECK_RESULT(0, "clBuildProgram failed");
+    }
+  }
+
+  switch (atomicType) {
+    case LocalHistogram:
+      kernel_ = _wrapper->clCreateKernel(_programs[0],
+                                         "local_atomics_histogram", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      kernel_ = _wrapper->clCreateKernel(_programs[1], "local_atomics_reduce",
+                                         &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case LocalReductionAtomics:
+    case Local4ReductionAtomics:
+      kernel_ =
+          _wrapper->clCreateKernel(_programs[0], "local_reduction", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+      kernel_ = _wrapper->clCreateKernel(_programs[0],
+                                         "global_atomics_histogram", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_workgroup", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
+  }
+}
+
+// Binds the arguments of the kernel(s) created for the given test type.
+// Every clSetKernelArg call is followed by its own CHECK_RESULT so that a
+// failure message names the argument that was rejected.
+void OCLPerfAtomicSpeed::SetKernelArguments(const AtomicType atomicType) {
+  cl_int status = CL_SUCCESS;
+  int argIndex = 0;
+
+  switch (atomicType) {
+    case LocalHistogram: {
+      // Kernel 0 (histogram): input buffer, output buffer, vectors per thread.
+      cl_kernel histogramKernel = _kernels[0];
+      status = _wrapper->clSetKernelArg(histogramKernel, argIndex++,
+                                        sizeof(cl_mem), (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(
+          histogramKernel, argIndex++, sizeof(cl_mem), (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(histogramKernel, argIndex++,
+                                         sizeof(_n4VectorsPerThread),
+                                         (void *)&_n4VectorsPerThread);
+      CHECK_RESULT(status, "clSetKernelArg failed. (n4VectorsPerThread)");
+
+      // Kernel 1 (reduce): output buffer, number of groups. Argument
+      // numbering restarts at zero for the second kernel.
+      cl_kernel reduceKernel = _kernels[1];
+      argIndex = 0;
+      status |= _wrapper->clSetKernelArg(reduceKernel, argIndex++,
+                                         sizeof(cl_mem), (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(reduceKernel, argIndex++,
+                                         sizeof(_nGroups), (void *)&_nGroups);
+      CHECK_RESULT(status, "clSetKernelArg failed. (nGroups)");
+      break;
+    }
+    case LocalReductionAtomics:
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case Local4ReductionAtomics: {
+      // Single kernel: input buffer, output buffer, local scratch memory.
+      const bool usesVec4 = (Local4ReductionNoAtomics == atomicType) ||
+                            (Local4ReductionAtomics == atomicType);
+      // DEFAULT_WG_SIZE cl_uints of local memory, four times that for the
+      // vec4 variants.
+      size_t scratchBytes = DEFAULT_WG_SIZE * sizeof(cl_uint);
+      if (usesVec4) {
+        scratchBytes *= 4;
+      }
+
+      cl_kernel reductionKernel = _kernels[0];
+      status = _wrapper->clSetKernelArg(reductionKernel, argIndex++,
+                                        sizeof(cl_mem), (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(
+          reductionKernel, argIndex++, sizeof(cl_mem), (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+
+      // A NULL value with a non-zero size requests local memory.
+      status = _wrapper->clSetKernelArg(reductionKernel, argIndex++,
+                                        scratchBytes, NULL);
+      CHECK_RESULT(status, "clSetKernelArg failed. (local memory)");
+      break;
+    }
+    case GlobalHistogram:
+    case Global4Histogram:
+    case GlobalWGReduction:
+    case Global4WGReduction:
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction: {
+      // Single kernel: items per thread, input buffer, output buffer.
+      // The "4" variants process four items per work-item.
+      int itemsPerThread = 1;
+      if ((Global4Histogram == atomicType) ||
+          (Global4WGReduction == atomicType) ||
+          (Global4AllToZeroReduction == atomicType)) {
+        itemsPerThread = 4;
+      }
+
+      cl_kernel globalKernel = _kernels[0];
+      status = _wrapper->clSetKernelArg(globalKernel, argIndex++,
+                                        sizeof(itemsPerThread),
+                                        (void *)&itemsPerThread);
+      CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)");
+
+      status = _wrapper->clSetKernelArg(globalKernel, argIndex++,
+                                        sizeof(cl_mem), (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(globalKernel, argIndex++,
+                                         sizeof(cl_mem), (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+      break;
+    }
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)");
+  }
+}
+
+// Zeroes the host output array and uploads it to _outputBuffer with a
+// blocking write, then drains the queue. The global-atomics kernels
+// accumulate into the output, so run() calls this before every iteration.
+void OCLPerfAtomicSpeed::ResetGlobalOutput() {
+  cl_int status;
+
+  memset(_output, 0, _outputNBytes);
+
+  status =
+      _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0,
+                                     _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed.");
+
+  // The call being checked is clFinish, so report it under that name.
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Runs the two-pass local histogram: kernel 0 builds the histograms and
+// kernel 1 reduces them. The second launch waits on the first through an
+// event. Both events are released before returning so repeated iterations
+// do not leak cl_event objects.
+void OCLPerfAtomicSpeed::RunLocalHistogram() {
+  cl_int status;  // OpenCL status codes are signed (cl_int).
+  cl_int waitStatus = CL_SUCCESS;
+  cl_event events[2] = {NULL, NULL};
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+  size_t globalThreadsReduce = NBINS;
+  size_t localThreadsReduce = _nThreadsPerGroup;
+
+  globalThreads[0] = _nThreads;
+  localThreads[0] = _nThreadsPerGroup;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, &events[0]);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (histogram)");
+
+  status = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, _kernels[1], 1, NULL, &globalThreadsReduce,
+      &localThreadsReduce, 1, &events[0], &events[1]);
+  if (status != CL_SUCCESS) {
+    // The first launch succeeded, so its event must not be leaked when the
+    // check below bails out.
+    clReleaseEvent(events[0]);
+  }
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduce)");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  if (status == CL_SUCCESS) {
+    waitStatus = _wrapper->clWaitForEvents(1, &events[0]);
+    waitStatus |= _wrapper->clWaitForEvents(1, &events[1]);
+  }
+
+  // Release the events before the checks so every exit path frees them.
+  clReleaseEvent(events[0]);
+  clReleaseEvent(events[1]);
+
+  CHECK_RESULT(status, "clFinish failed.");
+  CHECK_RESULT(waitStatus, "clWaitForEvents failed.");
+}
+
+// Launches the local reduction kernel (kernel 0) and waits for completion.
+// The global size is half the number of input cl_uints, divided by four
+// again for the vec4 variants.
+void OCLPerfAtomicSpeed::RunLocalReduction(const AtomicType atomicType) {
+  cl_int status;  // OpenCL status codes are signed (cl_int).
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint) / 2;
+  localThreads[0] = _nThreadsPerGroup;
+  if ((Local4ReductionNoAtomics == atomicType) ||
+      (Local4ReductionAtomics == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduction)");
+
+  // The call being checked is clFinish, so report it under that name.
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Launches the single global-atomics kernel (kernel 0) and waits for
+// completion. Used for the global histogram and both global reduction
+// families. One work-item per input cl_uint, or per four cl_uints for the
+// "4" variants.
+void OCLPerfAtomicSpeed::RunGlobalHistogram(AtomicType atomicType) {
+  cl_int status;  // OpenCL status codes are signed (cl_int).
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint);
+  localThreads[0] = _nThreadsPerGroup;
+
+  if ((Global4Histogram == atomicType) || (Global4WGReduction == atomicType) ||
+      (Global4AllToZeroReduction == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed.");
+
+  // The call being checked is clFinish, so report it under that name.
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Runs the currently opened sub-test: uploads the input, binds the kernel
+// arguments, executes the kernel(s) _numLoops + 1 times (the first pass is
+// an untimed warm-up), reads the output back, publishes the timing through
+// PrintResults() and records the verification outcome in _errorFlag.
+// Does nothing when atomics are unsupported or the data set is too large.
+void OCLPerfAtomicSpeed::run() {
+  cl_int status;  // OpenCL status codes are signed (cl_int).
+  AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
+
+  // Verify atomics are supported.
+  if ((!_atomicsSupported) || (_dataSizeTooBig)) return;
+
+  // Write data to the GPU
+  status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0,
+                                          _inputNBytes, _input, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)");
+
+  status = _wrapper->clFlush(cmd_queue_);
+  CHECK_RESULT(status, "clFlush failed.");
+
+  // Set the current arguments based on the test type.
+  SetKernelArguments(atomicType);
+
+  // Run the kernels.
+  CPerfCounter timer;
+  double totalTime = 0.0;
+
+  for (unsigned int k = 0; k < _numLoops + 1; k++) {
+    // Since we run multiple times using global atomics the output
+    // would get accumulated therefore first clean it. The reset happens
+    // before the timer starts, so it is not part of the measurement.
+    ResetGlobalOutput();
+
+    timer.Reset();
+    timer.Start();
+    switch (atomicType) {
+      case LocalHistogram:
+        RunLocalHistogram();
+        break;
+      case LocalReductionAtomics:
+      case LocalReductionNoAtomics:
+      case Local4ReductionNoAtomics:
+      case Local4ReductionAtomics:
+        RunLocalReduction(atomicType);
+        break;
+      case GlobalHistogram:
+      case Global4Histogram:
+      case GlobalWGReduction:
+      case Global4WGReduction:
+      case GlobalAllToZeroReduction:
+      case Global4AllToZeroReduction:
+        RunGlobalHistogram(atomicType);
+        break;
+      default:
+        CHECK_RESULT(true, "Atomic type not supported");
+    }
+    timer.Stop();
+    // Don't count the warm-up
+    if (0 != k) totalTime += timer.GetElapsedTime();
+  }
+
+  // Read the results back to the CPU - Only do it for the last run
+  // of the test instead of for each iteration of _numLoops.
+  status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0,
+                                         _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueReadBuffer failed.");
+  // The read above is non-blocking; clFinish makes _output safe to inspect.
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+
+  // Print the results.
+  PrintResults(atomicType, totalTime);
+
+  // Check the results for the current test.
+  _errorFlag = !(VerifyResults(atomicType));
+}
+
+// Checks the device output held in _output against the CPU reference
+// values. Returns true when they agree; otherwise prints a marker and
+// returns false. An unknown test type is reported and also yields false.
+bool OCLPerfAtomicSpeed::VerifyResults(const AtomicType atomicType) {
+  bool matches = true;
+
+  switch (atomicType) {
+    case LocalHistogram:
+    case GlobalHistogram:
+    case Global4Histogram:
+      // Histogram tests: all NBINS bins must equal the CPU histogram.
+      matches = (0 == memcmp(_cpuhist, _output, NBINS * sizeof(cl_uint)));
+      break;
+    case LocalReductionAtomics:
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case Local4ReductionAtomics:
+    case GlobalWGReduction:
+    case Global4WGReduction: {
+      // Partial-sum tests: add up the leading output words and compare the
+      // total with the CPU sum.
+      const cl_uint partialCount =
+          _inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
+      cl_uint deviceSum = 0;
+      for (cl_uint idx = 0; idx != partialCount; ++idx) {
+        deviceSum += _output[idx];
+      }
+      matches = (deviceSum == _cpuReductionSum);
+      break;
+    }
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      // All-to-zero tests: the whole sum lives in the first output word.
+      matches = (_output[0] == _cpuReductionSum);
+      break;
+    default:
+      CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
+      return false;
+  }
+
+  if (matches) {
+    return true;
+  }
+  printf("WRONG VALUES!!!!!");
+  return false;
+}
+
+// Releases every OpenCL object and host allocation held by the test and
+// resets the corresponding members. Returns _crcword.
+unsigned int OCLPerfAtomicSpeed::close() {
+  // Kernels and programs live in separate vectors, so each loop needs its
+  // own index starting at zero. Sharing one index skipped the first
+  // _kernels.size() programs and leaked them.
+  for (size_t k = 0; k < _kernels.size(); k++) {
+    error_ = _wrapper->clReleaseKernel(_kernels[k]);
+  }
+  for (size_t p = 0; p < _programs.size(); p++) {
+    error_ = _wrapper->clReleaseProgram(_programs[p]);
+  }
+  if (_inputBuffer) {
+    error_ = clReleaseMemObject(_inputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
+  }
+  if (_outputBuffer) {
+    error_ = clReleaseMemObject(_outputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
+  }
+
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // Free host memory (free(NULL) is a no-op).
+  free(_input);
+  free(_output);
+
+  // Reset everything so stale handles are never released twice.
+  _kernels.clear();
+  _programs.clear();
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+  cmd_queue_ = NULL;
+  context_ = NULL;
+  _input = NULL;
+  _output = NULL;
+
+  return _crcword;
+}
+
+/* Helper functions */
+// Builds the CPU reference data from _input: a 256-bin histogram over every
+// byte of the input (_cpuhist) and the sum of the two low bits of every
+// byte (_cpuReductionSum).
+void OCLPerfAtomicSpeed::calculateHostBin() {
+  const cl_int *words = (cl_int *)_input;
+  const size_t wordCount = _inputNBytes / sizeof(cl_uint);
+
+  _cpuReductionSum = 0;
+  memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
+
+  for (size_t w = 0; w < wordCount; ++w) {
+    const cl_int word = words[w];
+    // Visit the four bytes of the word, most significant first.
+    for (int shift = 24; shift >= 0; shift -= 8) {
+      const cl_int shifted = word >> shift;
+      _cpuhist[shifted & 0xff]++;
+      _cpuReductionSum += shifted & 0x3;
+    }
+  }
+}
+
+// Sizes the input for the current scale factor, allocates the host input
+// array and fills it with pseudo-random words seeded from the current time.
+// On allocation failure sets _dataSizeTooBig and leaves _input null.
+void OCLPerfAtomicSpeed::setupHistogram() {
+  _nThreads = 64 * 1024;
+
+  // 32-bit Windows builds use a smaller base problem size.
+#if defined(_WIN32) && !defined(_WIN64)
+  const cl_uint baseVectorCount = 1024 * 1024;
+#else
+  const cl_uint baseVectorCount = 2048 * 2048;
+#endif
+  _n4Vectors = baseVectorCount * _nCurrentInputScale;
+  _n4VectorsPerThread = _n4Vectors / _nThreads;
+  _inputNBytes = _n4Vectors * sizeof(cl_uint4);
+
+  _input = (cl_uint *)malloc(_inputNBytes);
+  if (NULL == _input) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Fill the input with a simple recurrence seeded by the clock: the
+  // multiplier stays fixed while the state is carried between words.
+  time_t now;
+  time(&now);
+  const cl_uint multiplier = (cl_uint)now;
+  cl_uint state = (cl_uint)now;
+
+  const size_t wordCount = _inputNBytes / sizeof(cl_uint);
+  for (size_t w = 0; w < wordCount; ++w) {
+    state = (multiplier * (state & 65535)) + (state >> 16);
+    _input[w] = state;
+  }
+}
+
+// Formats the result line for the current test into testDescString and
+// stores the achieved rate (GB processed per second of average loop time)
+// in _perfInfo. `totalTime` is the accumulated time of the timed loops.
+void OCLPerfAtomicSpeed::PrintResults(const AtomicType atomicType,
+                                      double totalTime) {
+  // Human-readable name of the test type.
+  const char *label = "";
+  switch (atomicType) {
+    case LocalHistogram:
+      label = "Local histogram";
+      break;
+    case GlobalHistogram:
+      label = "Global histogram";
+      break;
+    case Global4Histogram:
+      label = "Global vec 4 histogram";
+      break;
+    case LocalReductionNoAtomics:
+      label = "Local reduction NO atomics";
+      break;
+    case Local4ReductionNoAtomics:
+      label = "Local vec 4 reduction NO atomics";
+      break;
+    case LocalReductionAtomics:
+      label = "Local reduction with atomics";
+      break;
+    case Local4ReductionAtomics:
+      label = "Local vec 4 reduction with atomics";
+      break;
+    case GlobalWGReduction:
+      label = "Global work-group reduction";
+      break;
+    case Global4WGReduction:
+      label = "Global vec 4 work-group reduction";
+      break;
+    case GlobalAllToZeroReduction:
+      label = "Global all to zero reduction";
+      break;
+    case Global4AllToZeroReduction:
+      label = "Global vec 4 all to zero reduction";
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (PrintResults)");
+  }
+
+  // Each cl_uint of input contributes four items, hence the factor of 4.
+  const double inputGB = (double)_inputNBytes * (double)(1e-09);
+  const double processedGB = inputGB * 4;
+  const double secondsPerLoop = totalTime / _numLoops;
+
+  char line[500];
+  SNPRINTF(line, sizeof(line), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s",
+           label, processedGB, secondsPerLoop);
+  _perfInfo = (float)(processedGB / secondsPerLoop);
+  testDescString = line;
+}
+
+// Returns true for every reduction test type, i.e. every enumerator from
+// LocalReductionNoAtomics through Global4AllToZeroReduction (the last
+// AtomicType value). The previous upper bound, GlobalAllToZeroReduction,
+// left the vec4 all-to-zero reduction out. Relies on the reduction
+// enumerators being contiguous in AtomicType.
+bool OCLPerfAtomicSpeed::IsReduction(const AtomicType atomicType) {
+  return ((atomicType >= LocalReductionNoAtomics) &&
+          (atomicType <= Global4AllToZeroReduction));
+}
@@ -0,0 +1,119 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_AtomicSpeed_H_
+#define _OCL_AtomicSpeed_H_
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "OCLTestImp.h"
+
+#define DEFAULT_WG_SIZE 256
+#define NBINS 256
+#define BITS_PER_PIX 8
+#define NBANKS 16
+
+// Identifies which kernel family a sub-test exercises. The trailing
+// comments give the label PrintResults() prints for each value.
+// OCLPerfAtomicSpeed::IsReduction() classifies values by enumerator range,
+// so the reduction enumerators must stay contiguous and in this order.
+enum AtomicType {
+  LocalHistogram = 0,         // "Local histogram" (two kernels)
+  GlobalHistogram,            // "Global histogram"
+  Global4Histogram,           // "Global vec 4 histogram"
+  LocalReductionNoAtomics,    // "Local reduction NO atomics"
+  Local4ReductionNoAtomics,   // "Local vec 4 reduction NO atomics"
+  LocalReductionAtomics,      // "Local reduction with atomics"
+  Local4ReductionAtomics,     // "Local vec 4 reduction with atomics"
+  GlobalWGReduction,          // "Global work-group reduction"
+  Global4WGReduction,         // "Global vec 4 work-group reduction"
+  GlobalAllToZeroReduction,   // "Global all to zero reduction"
+  Global4AllToZeroReduction,  // "Global vec 4 all to zero reduction"
+};
+
+// One entry of the sub-test table: the kernel family to run and a scale
+// factor for the input.
+typedef struct {
+  AtomicType atomicType;  // Kernel family selected for this sub-test.
+  int inputScale;  // presumably multiplies the base input size - TODO confirm
+} testOCLPerfAtomicSpeedStruct;
+
+// Performance test that times histogram and reduction kernels built on
+// local and global atomics, and checks their output against values
+// computed on the CPU.
+class OCLPerfAtomicSpeed : public OCLTestImp {
+ public:
+  OCLPerfAtomicSpeed();
+  virtual ~OCLPerfAtomicSpeed();
+
+ public:
+  // Test life cycle entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // OpenCL objects used by the test; released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  std::vector<cl_program> _programs;
+  std::vector<cl_kernel> _kernels;
+  cl_device_id device;
+
+  // run() returns immediately when atomics are unsupported or the data set
+  // is too big.
+  bool _atomicsSupported;
+  bool _dataSizeTooBig;
+  // Number of timed iterations; run() adds one untimed warm-up pass.
+  cl_uint _numLoops;
+
+  // Histogram related stuff...
+ private:
+  cl_ulong _maxMemoryAllocationSize;
+  // Sizes in bytes of the input and output arrays/buffers.
+  cl_uint _inputNBytes;
+  cl_uint _outputNBytes;
+
+  // Scale factor applied to the base input size in setupHistogram().
+  cl_uint _nCurrentInputScale;
+  cl_uint _workgroupSize;
+  //    cl_uint nLoops;
+  // Global work size of the local histogram kernel.
+  cl_uint _nThreads;
+  // Local work size used for every kernel launch.
+  cl_uint _nThreadsPerGroup;
+  cl_uint _nGroups;
+  // Number of cl_uint4 vectors in the input, in total and per thread.
+  cl_uint _n4Vectors;
+  cl_uint _n4VectorsPerThread;
+  cl_uint _nBins;
+  cl_uint _nBytesLDSPerGrp;
+
+  // Host copies (malloc'ed) and the matching device buffers.
+  cl_uint* _input;
+  cl_uint* _output;
+  cl_mem _inputBuffer;
+  cl_mem _outputBuffer;
+
+  // CPU reference results filled in by calculateHostBin().
+  cl_uint _cpuhist[NBINS];
+  cl_uint _cpuReductionSum;
+
+  void calculateHostBin();
+  void setupHistogram();
+  bool VerifyResults(const AtomicType atomicType);
+  void ResetGlobalOutput();
+
+  // Methods that do the actual NDRange launches.
+  void RunLocalHistogram();
+  void RunLocalReduction(const AtomicType atomicType);
+  void RunGlobalHistogram(const AtomicType atomicType);
+
+  void CreateKernels(const AtomicType atomicType);
+  bool IsReduction(const AtomicType atomicType);
+  void SetKernelArguments(const AtomicType atomicType);
+  void PrintResults(const AtomicType atomicType, double totalTime);
+};
+
+#endif  // _OCL_AtomicSpeed_H_
@@ -0,0 +1,509 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfAtomicSpeed20.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "CL/cl.h"
+#include "OCLPerfAtomicSpeed20Kernels.h"
+#include "Timer.h"
+
+// Bounded string formatting: maps to sprintf_s when WIN_OS is defined and
+// to snprintf otherwise. Both are called as SNPRINTF(buffer, size, fmt, ...).
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Sub-test table: each entry pairs an atomic test type with an input scale
+// factor. The position in this table is the sub-test index.
+testOCLPerfAtomicSpeed20Struct testOCLPerfAtomicSpeed20List[] = {
+    {GlobalWGReduction, 1},
+    {GlobalWGReduction, 2},
+    {GlobalWGReduction, 4},
+    {GlobalAllToZeroReduction, 1},
+    {GlobalAllToZeroReduction, 2},
+    {GlobalAllToZeroReduction, 4},
+    {Global4WGReduction, 1},
+    {Global4WGReduction, 2},
+    {Global4WGReduction, 4},
+    {Global4AllToZeroReduction, 1},
+    {Global4AllToZeroReduction, 2},
+    {Global4AllToZeroReduction, 4},
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// OCLPerfAtomicSpeed20 implementation.
+///////////////////////////////////////////////////////////////////////////////
+// Puts the test into its idle state: no buffers, no kernels, default loop
+// count and work-group size, and one sub-test per table entry.
+OCLPerfAtomicSpeed20::OCLPerfAtomicSpeed20() {
+  // Sub-test bookkeeping.
+  _numSubTests = sizeof(testOCLPerfAtomicSpeed20List) /
+                 sizeof(testOCLPerfAtomicSpeed20List[0]);
+  _numLoops = 10;
+  skip_ = false;
+
+  // Capability flags; open() decides their real values.
+  _atomicsSupported = false;
+  _dataSizeTooBig = false;
+
+  // Problem sizing defaults.
+  _nCurrentInputScale = 1;
+  _maxMemoryAllocationSize = 0;
+  _workgroupSize = 256;
+
+  // No host or device memory is held yet.
+  _input = NULL;
+  _output = NULL;
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+
+  _kernels.clear();
+  _programs.clear();
+}
+
+// Empty destructor: it performs no cleanup itself.
+OCLPerfAtomicSpeed20::~OCLPerfAtomicSpeed20() {}
+
+// Prepares sub-test `test` on device `deviceId`: generates the host input
+// and CPU reference, checks that the device can hold the data and exposes
+// the required atomics extension, then creates the queue, buffers and
+// kernels. When the headers predate OpenCL 2.0 the test is marked skipped.
+// Sets _dataSizeTooBig or leaves _atomicsSupported false to make run() a
+// no-op when the sub-test cannot execute on this device.
+void OCLPerfAtomicSpeed20::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  error_ = CL_SUCCESS;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+
+#if defined(CL_VERSION_2_0)
+  cl_device_id device;
+  cl_int status = CL_SUCCESS;
+
+  conversion = 1.0f;
+  _openTest = test;
+  _cpuReductionSum = 0;
+  _nCurrentInputScale = testOCLPerfAtomicSpeed20List[_openTest].inputScale;
+  AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType;
+
+  // Setup stuff...
+  setupHistogram();
+  // Without a host input array there is nothing to histogram or upload.
+  // Bail out before calculateHostBin() and make sure the "too big" flag
+  // survives (it is reset further down for the device-limit check).
+  // NOTE(review): assumes setupHistogram() is what allocates _input.
+  if (0 == _input) {
+    _dataSizeTooBig = true;
+    return;
+  }
+  calculateHostBin();
+
+  device = devices_[_deviceId];
+
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Global memory size
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                     sizeof(cl_ulong),
+                                     &_maxMemoryAllocationSize, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS,
+               "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");
+
+  // Check that the test size is not too big for the current GPU, keeping
+  // 10 MB of headroom. The headroom is added on the left-hand side because
+  // subtracting it from an unsigned limit below 10 MB would wrap around and
+  // accept any size.
+  _dataSizeTooBig = false;
+  cl_ulong tenMB = 1024 * 10240;
+  if (((cl_ulong)_inputNBytes + tenMB) >= _maxMemoryAllocationSize) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  char *p = strstr(charbuf, "cl_khr_global_int32_base_atomics");
+
+  _atomicsSupported = false;
+  if (p) _atomicsSupported = true;
+
+  // Verify atomics are supported.
+  if (!_atomicsSupported) return;
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Create buffers...
+  _inputBuffer =
+      clCreateBuffer(context_, CL_MEM_READ_ONLY, _inputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)");
+
+  // Create the programs/kernels for the current test type.
+  CreateKernels(atomicType);
+
+  _nThreadsPerGroup = _workgroupSize;
+  _nGroups = _nThreads / _nThreadsPerGroup;
+  // The output is sized like the input.
+  _outputNBytes = _inputNBytes;
+
+  _output = (cl_uint *)malloc(_outputNBytes);
+  if (0 == _output) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Create output Buffer
+  _outputBuffer =
+      clCreateBuffer(context_, CL_MEM_READ_WRITE, _outputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)");
+#else
+  skip_ = true;
+  testDescString = "OpenCL verion < 2.0. Test Skipped.";
+  return;
+#endif
+}
+
+// Creates, builds and registers the program and kernel for the given test
+// type. Programs are appended to _programs and kernels to _kernels. A
+// build failure prints the build log and is reported through CHECK_RESULT.
+void OCLPerfAtomicSpeed20::CreateKernels(const AtomicType atomicType) {
+  char log[16384];
+  cl_kernel kernel_;
+  cl_program program_;
+  char buildOptions[1000];
+  cl_int status = CL_SUCCESS;
+  cl_device_id device = devices_[_deviceId];
+
+  SNPRINTF(buildOptions, sizeof(buildOptions),
+           "-cl-std=CL2.0 -D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS,
+           BITS_PER_PIX, NBANKS);
+
+  // Create the programs.
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
+  }
+  // Build the programs.
+  for (size_t i = 0; i < _programs.size(); i++) {
+    error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
+                                      NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Start from an empty string so a failed log query never prints
+      // uninitialized memory.
+      log[0] = '\0';
+      status = _wrapper->clGetProgramBuildInfo(_programs[i], device,
+                                               CL_PROGRAM_BUILD_LOG,
+                                               sizeof(log), log, NULL);
+      printf("Build error -> %s\n",
+             (status == CL_SUCCESS) ? log : "(build log unavailable)");
+
+      // CHECK_RESULT fires when its condition is true; the previous
+      // constant 0 meant the build failure was never reported here.
+      CHECK_RESULT(true, "clBuildProgram failed");
+    }
+  }
+
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_workgroup", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
+  }
+}
+
+// Binds the three arguments of the single global-atomics kernel: items per
+// thread, input buffer, output buffer. Each call is checked on its own so
+// that a failure message names the rejected argument.
+void OCLPerfAtomicSpeed20::SetKernelArguments(const AtomicType atomicType) {
+  cl_int status = CL_SUCCESS;
+  int argIndex = 0;
+
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction:
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction: {
+      // The "4" variants process four items per work-item.
+      int itemsPerThread = 1;
+      if ((Global4WGReduction == atomicType) ||
+          (Global4AllToZeroReduction == atomicType)) {
+        itemsPerThread = 4;
+      }
+
+      cl_kernel reductionKernel = _kernels[0];
+      status = _wrapper->clSetKernelArg(reductionKernel, argIndex++,
+                                        sizeof(itemsPerThread),
+                                        (void *)&itemsPerThread);
+      CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)");
+
+      status = _wrapper->clSetKernelArg(reductionKernel, argIndex++,
+                                        sizeof(cl_mem), (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(
+          reductionKernel, argIndex++, sizeof(cl_mem), (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+      break;
+    }
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)");
+  }
+}
+
+// Zeroes the host output array and uploads it to _outputBuffer with a
+// blocking write, then drains the queue. The global-atomics kernels
+// accumulate into the output, so run() calls this before every iteration.
+void OCLPerfAtomicSpeed20::ResetGlobalOutput() {
+  cl_int status;
+
+  memset(_output, 0, _outputNBytes);
+
+  status =
+      _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0,
+                                     _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed.");
+
+  // The call being checked is clFinish, so report it under that name.
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Launches the single global-atomics kernel (kernel 0) and waits for
+// completion. One work-item per input cl_uint, or per four cl_uints for
+// the "4" variants.
+void OCLPerfAtomicSpeed20::RunGlobalHistogram(AtomicType atomicType) {
+  cl_int status;  // OpenCL status codes are signed (cl_int).
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint);
+  localThreads[0] = _nThreadsPerGroup;
+
+  if ((Global4WGReduction == atomicType) ||
+      (Global4AllToZeroReduction == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed.");
+
+  // The call being checked is clFinish, so report it under that name.
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Run the AtomicSpeed logic: upload the input, launch the selected kernel
+// _numLoops + 1 times (the first launch is an untimed warm-up), read the
+// output back, publish the timing and verify the result.
+// Does nothing when the test is skipped, atomics are unsupported, the data
+// size was too big, or CL_VERSION_2_0 is not defined at compile time.
+void OCLPerfAtomicSpeed20::run() {
+  if (skip_) {
+    return;
+  }
+
+#if defined(CL_VERSION_2_0)
+  // OpenCL error codes are signed, so keep the status in a cl_int.
+  cl_int status;
+  AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType;
+
+  // Verify atomics are supported.
+  if ((!_atomicsSupported) || (_dataSizeTooBig)) return;
+
+  // Write data to the GPU (non-blocking, flushed right away).
+  status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0,
+                                          _inputNBytes, _input, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)");
+
+  status = _wrapper->clFlush(cmd_queue_);
+  CHECK_RESULT(status, "clFlush failed.");
+
+  // Set the current arguments based on the test type.
+  SetKernelArguments(atomicType);
+
+  // Run the kernels.
+  CPerfCounter timer;
+  double totalTime = 0.0;
+
+  for (unsigned int k = 0; k < _numLoops + 1; k++) {
+    // Since we run multiple times using global atomics the output
+    // would get accumulated therefore first clean it. The reset happens
+    // before the timer starts, so it is not part of the measurement.
+    ResetGlobalOutput();
+
+    timer.Reset();
+    timer.Start();
+    switch (atomicType) {
+      case GlobalWGReduction:
+      case Global4WGReduction:
+      case GlobalAllToZeroReduction:
+      case Global4AllToZeroReduction:
+        RunGlobalHistogram(atomicType);
+        break;
+      default:
+        CHECK_RESULT(true, "Atomic type not supported");
+    }
+    timer.Stop();
+    // Don't count the warm-up
+    if (0 != k) totalTime += timer.GetElapsedTime();
+  }
+
+  // Non-blocking read; the clFinish below guarantees _output is complete.
+  status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0,
+                                         _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueReadBuffer failed.");
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+
+  // Print the results.
+  PrintResults(atomicType, totalTime);
+
+  // Check the results for the current test.
+  _errorFlag = !(VerifyResults(atomicType));
+#endif
+}
+
+// Check the values read back into _output against _cpuReductionSum.
+// Returns true when they agree; returns false on a mismatch (after printing
+// a warning) or when the atomic type is not one of the four handled here.
+bool OCLPerfAtomicSpeed20::VerifyResults(const AtomicType atomicType) {
+  bool matches = true;
+
+  if ((GlobalWGReduction == atomicType) ||
+      (Global4WGReduction == atomicType)) {
+    // The result is spread over several output slots: add them up first.
+    const cl_uint slotCount =
+        _inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
+    cl_uint deviceSum = 0;
+    for (cl_uint slot = 0; slot < slotCount; ++slot) {
+      deviceSum += _output[slot];
+    }
+    matches = (deviceSum == _cpuReductionSum);
+  } else if ((GlobalAllToZeroReduction == atomicType) ||
+             (Global4AllToZeroReduction == atomicType)) {
+    // Everything was accumulated into the first output element.
+    matches = (_output[0] == _cpuReductionSum);
+  } else {
+    CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
+    return false;
+  }
+
+  if (!matches) printf("WRONG VALUES!!!!!");
+  return matches;
+}
+
+// Release every kernel, program and buffer held by this test, free the host
+// arrays, reset the members to an empty state and return the result of
+// OCLTestImp::close().
+unsigned int OCLPerfAtomicSpeed20::close() {
+  for (size_t k = 0; k < _kernels.size(); k++) {
+    error_ = _wrapper->clReleaseKernel(_kernels[k]);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseKernel failed.");
+  }
+  // Uses its own index: sharing one counter with the kernel loop skipped the
+  // first _kernels.size() programs and leaked them.
+  for (size_t p = 0; p < _programs.size(); p++) {
+    error_ = _wrapper->clReleaseProgram(_programs[p]);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseProgram failed.");
+  }
+
+  if (_inputBuffer) {
+    error_ = clReleaseMemObject(_inputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
+  }
+  if (_outputBuffer) {
+    error_ = clReleaseMemObject(_outputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
+  }
+
+  // Free host memory (free(NULL) is a no-op).
+  free(_input);
+  free(_output);
+
+  // Reset everything so a repeated close() does not release twice.
+  _kernels.clear();
+  _programs.clear();
+
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+
+  _input = NULL;
+  _output = NULL;
+
+  return OCLTestImp::close();
+}
+
+/* Helper functions */
+// Compute the CPU reference results from _input:
+//  - _cpuhist: count of every byte value over all four bytes of each word;
+//  - _cpuReductionSum: sum of the low two bits of every byte.
+void OCLPerfAtomicSpeed20::calculateHostBin() {
+  // Read the words as unsigned so the right shifts are well defined.
+  const cl_uint *words = _input;
+  const size_t wordCount = _inputNBytes / sizeof(cl_uint);
+
+  memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
+  _cpuReductionSum = 0;
+
+  for (size_t i = 0; i < wordCount; i++) {
+    const cl_uint word = words[i];
+    // Visit the four bytes of the word.
+    for (unsigned int shift = 0; shift < 32; shift += 8) {
+      const cl_uint byteValue = (word >> shift) & 0xff;
+      _cpuhist[byteValue]++;
+      _cpuReductionSum += byteValue & 0x3;
+    }
+  }
+}
+
+// Size and allocate the host input array and fill it with pseudo-random
+// words. Sets _dataSizeTooBig and returns early when the allocation fails.
+void OCLPerfAtomicSpeed20::setupHistogram() {
+  _nThreads = 64 * 1024;
+  _n4Vectors = 2048 * 2048;
+  _n4Vectors *= _nCurrentInputScale;
+  _n4VectorsPerThread = _n4Vectors / _nThreads;
+  _inputNBytes = _n4Vectors * sizeof(cl_uint4);
+
+  _input = (cl_uint *)malloc(_inputNBytes);
+  if (0 == _input) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // random initialization of input, seeded from the current time so the
+  // data differs between runs
+  time_t ltime;
+  time(&ltime);
+  cl_uint a = (cl_uint)ltime, b = (cl_uint)ltime;
+  cl_uint *p = _input;
+
+  // Each step: b = a * (low 16 bits of b) + (high 16 bits of b).
+  for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++)
+    p[i] = (b = (a * (b & 65535)) + (b >> 16));
+}
+
+// Format the outcome of the current test: stores the description line in
+// testDescString and the throughput (GB per second of one loop) in
+// _perfInfo. Bails out through CHECK_RESULT for an unknown atomic type.
+void OCLPerfAtomicSpeed20::PrintResults(const AtomicType atomicType,
+                                        double totalTime) {
+  // Human-readable name of the variant that was run.
+  const char *label = "";
+  switch (atomicType) {
+    case GlobalWGReduction:
+      label = "Global work-group reduction";
+      break;
+    case Global4WGReduction:
+      label = "Global vec 4 work-group reduction";
+      break;
+    case GlobalAllToZeroReduction:
+      label = "Global all to zero reduction";
+      break;
+    case Global4AllToZeroReduction:
+      label = "Global vec 4 all to zero reduction";
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (PrintResults)");
+  }
+
+  const double inputGB = (double)_inputNBytes * (double)(1e-09);
+  // each cl_uint in _inputNBytes contributes 4 items.
+  const double dataGB = (double)inputGB * 4;
+  // Average time of one timed loop.
+  const double secondsPerLoop = totalTime / _numLoops;
+
+  char line[500];
+  SNPRINTF(line, sizeof(line), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s",
+           label, dataGB, secondsPerLoop);
+  _perfInfo = (float)(dataGB / secondsPerLoop);
+  testDescString = line;
+}
@@ -0,0 +1,102 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_AtomicSpeed20_H_
+#define _OCL_AtomicSpeed20_H_
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "OCLTestImp.h"
+
+// Compile-time constants for the atomic-speed tests.
+// NOTE(review): NBINS, BITS_PER_PIX and NBANKS also appear as bare
+// identifiers inside the OpenCL kernel source strings; presumably the same
+// values are handed to the kernel compiler -- confirm where.
+#define DEFAULT_WG_SIZE 256
+// Number of histogram bins; sizes the _cpuhist reference array.
+#define NBINS 256
+#define BITS_PER_PIX 8
+#define NBANKS 16
+
+#include "OCLPerfAtomicSpeed.h"
+
+// One entry of the OCLPerfAtomicSpeed20 test table.
+typedef struct {
+  // Variant to run; run() reads it from testOCLPerfAtomicSpeed20List.
+  AtomicType atomicType;
+  // NOTE(review): presumably the multiplier applied to the input size --
+  // confirm against open().
+  int inputScale;
+} testOCLPerfAtomicSpeed20Struct;
+
+// Performance test that times global-atomic sum reductions and checks the
+// device result against a sum computed on the CPU.
+class OCLPerfAtomicSpeed20 : public OCLTestImp {
+ public:
+  OCLPerfAtomicSpeed20();
+  virtual ~OCLPerfAtomicSpeed20();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Uploads the input, runs the timed kernel loop and verifies the output.
+  virtual void run(void);
+  // Releases all OpenCL objects and host memory owned by the test.
+  virtual unsigned int close(void);
+
+  // Queue used for every enqueue/flush/finish issued by this test.
+  cl_command_queue cmd_queue_;
+  // Programs and kernels released in close(); _kernels[0] is the one launched.
+  std::vector<cl_program> _programs;
+  std::vector<cl_kernel> _kernels;
+
+  // run() returns without doing work when atomics are unsupported or the
+  // data size was too big (the latter is set when the input malloc fails).
+  bool _atomicsSupported;
+  bool _dataSizeTooBig;
+  // Number of timed launches; run() performs one extra untimed warm-up.
+  cl_uint _numLoops;
+
+  // Histogram related stuff...
+ private:
+  cl_ulong _maxMemoryAllocationSize;
+  // Sizes in bytes of the _input and _output host arrays.
+  cl_uint _inputNBytes;
+  cl_uint _outputNBytes;
+
+  // Multiplier applied to the base vector count in setupHistogram().
+  cl_uint _nCurrentInputScale;
+  cl_uint _workgroupSize;
+  //    cl_uint nLoops;
+  cl_uint _nThreads;
+  // Local work size used for the NDRange launch.
+  cl_uint _nThreadsPerGroup;
+  cl_uint _nGroups;
+  cl_uint _n4Vectors;
+  cl_uint _n4VectorsPerThread;
+  cl_uint _nBins;
+  cl_uint _nBytesLDSPerGrp;
+
+  // Host arrays (malloc'ed, freed in close()) and their device buffers.
+  cl_uint* _input;
+  cl_uint* _output;
+  cl_mem _inputBuffer;
+  cl_mem _outputBuffer;
+  // When set, run() returns immediately.
+  bool skip_;
+
+  // CPU reference results filled in by calculateHostBin().
+  cl_uint _cpuhist[NBINS];
+  cl_uint _cpuReductionSum;
+
+  void calculateHostBin();
+  void setupHistogram();
+  bool VerifyResults(const AtomicType atomicType);
+  void ResetGlobalOutput();
+
+  // Enqueues the NDRange for _kernels[0] and waits for it to finish.
+  void RunGlobalHistogram(const AtomicType atomicType);
+
+  void CreateKernels(const AtomicType atomicType);
+  void SetKernelArguments(const AtomicType atomicType);
+  void PrintResults(const AtomicType atomicType, double totalTime);
+};
+
+#endif  // _OCL_AtomicSpeed20_H_
@@ -0,0 +1,73 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+// OpenCL C source (uses the atomic_int / atomic_fetch_add_explicit API).
+// Each work-item walks ItemsPerThread input words, stepping by the global
+// size, and sums the low two bits of every byte of each word. The per-item
+// sum is then added atomically (relaxed order, device scope) to Output[0],
+// so every work-item contends on the same location.
+static const char *global_atomics_sum_reduction_all_to_zero =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_all_to_zero(uint "
+    "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"
+    "    const uint msk =  (uint)3;\n"
+    "    const uint shft = (uint)8;\n"
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"
+    "    }\n"
+    "    atomic_fetch_add_explicit( &(Output[0]), sum, memory_order_relaxed, "
+    "memory_scope_device);\n"
+    "}\n";
+
+// OpenCL C source (uses the atomic_int / atomic_fetch_add_explicit API).
+// Same per-work-item summation as the all-to-zero kernel, but the atomic
+// add (relaxed order, device scope) targets Output[get_group_id(0)], so
+// only work-items of the same work-group contend on one location.
+static const char *global_atomics_sum_reduction_workgroup =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_workgroup(uint "
+    "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"
+    "    const uint msk =  (uint)3;\n"
+    "    const uint shft = (uint)8;\n"
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"
+    "    }\n"
+    "    atomic_fetch_add_explicit( &(Output[get_group_id(0)]), sum, "
+    "memory_order_relaxed, memory_scope_device);\n"
+    "}\n";
@@ -0,0 +1,402 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+// OpenCL C source for a histogram built with __local atomics.
+// Phases, as written in the kernel:
+//   1. clear the __local array subhists[NBANKS * NBINS] in parallel;
+//   2. for n4VectorsPerThread uint4 loads (stepping by the global size),
+//      atom_inc the slot (byte * NBANKS + ltid % NBANKS) for each of the
+//      four bytes of each component;
+//   3. work-items with ltid < NBINS add up the NBANKS slots of their bin and
+//      store the total at Histogram[group_id * NBINS + ltid].
+// NBANKS, NBINS and BITS_PER_PIX are not defined inside this string; they
+// have to be supplied when the program is built (TODO confirm how).
+// NOTE(review): the in-kernel comment says "at least 1" but the expression
+// is MIN(1, ...); confirm the intent before touching the clear logic.
+static const char *local_atomics_histogram =
+    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+    "#define MIN(a,b) ((a) < (b)) ? (a) : (b) \n"
+    "#define MAX(a,b) ((a) > (b)) ? (a) : (b) \n"
+    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
+    "void local_atomics_histogram(__global uint4 *Image,\n"
+    "__global uint  *Histogram,\n"
+    "uint  n4VectorsPerThread)\n"
+    "{\n"
+    "    __local __attribute__((aligned(16))) uint subhists[NBANKS * NBINS];\n"
+    "\n"
+    "    uint tid     = get_global_id(0);\n"
+    "    uint ltid    = get_local_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "\n"
+    "    uint i, idx;\n"
+    "    uint4 temp, temp2;\n"
+    "    const uint shft = (uint) BITS_PER_PIX;\n"
+    "    const uint msk =  (uint) (NBINS-1);\n"
+    "    uint offset = (uint) ltid % (uint) (NBANKS);\n"
+    "\n"
+    "    uint lmem_items = NBANKS * NBINS;\n"
+    "    uint lmem_items_per_thread;\n"
+    "    uint lmem_max_threads;\n"
+    "\n"
+    "    // parallel LDS clear\n"
+    "    // first, calculate threads per item, at least 1:\n"
+    "    lmem_max_threads = MIN( 1, get_local_size(0) / lmem_items );\n"
+    "    // but no more than we have items:\n"
+    "    lmem_max_threads = MAX( 1, lmem_max_threads / lmem_items );\n"
+    "    // calculate threads total:\n"
+    "    lmem_max_threads = lmem_items / lmem_max_threads;\n"
+    "    // but no more than LDS banks:\n"
+    "    lmem_max_threads = MIN( get_local_size(0), lmem_max_threads );\n"
+    "\n"
+    "    lmem_items_per_thread = lmem_items / lmem_max_threads;\n"
+    "\n"
+    "    // now, clear LDS\n"
+    "    __local uint4 *p = (__local uint4 *) subhists;\n"
+    "\n"
+    "    if( ltid < lmem_max_threads )\n"
+    "    {\n"
+    "        for(i=0, idx=ltid; i<lmem_items_per_thread/4; i++, "
+    "idx+=lmem_max_threads)\n"
+    "        {\n"
+    "            p[idx] = 0;\n"
+    "        }\n"
+    "    }\n"
+    "\n"
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "\n"
+    "    // read & scatter phase\n"
+    "\n"
+    "    for( i=0, idx=tid; i<n4VectorsPerThread; i++, idx += Stride )\n"
+    "    {\n"
+    "        temp = Image[idx];\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "\n"
+    "        temp = temp >> shft;\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "\n"
+    "        temp = temp >> shft;\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "\n"
+    "        temp = temp >> shft;\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "    }\n"
+    "\n"
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "\n"
+    "    // reduce __local banks to single histogram per work-group\n"
+    "\n"
+    "    if( ltid < NBINS )\n"
+    "    {\n"
+    "        uint bin = 0;\n"
+    "        for( i=0; i<NBANKS; i++ )\n"
+    "        {\n"
+    "            bin += subhists[ (ltid * NBANKS) + i ];\n"
+    "        }\n"
+    "        Histogram[ (get_group_id(0) * NBINS) + ltid ] = bin;\n"
+    "    }\n"
+    "}\n";
+
+// OpenCL C source that folds nSubHists consecutive NBINS-sized histograms
+// into the first one. Work-item tid sums Histogram[i * NBINS + tid] over
+// all i and writes the total to Histogram[tid]. NBINS is not defined in the
+// string itself.
+static const char *local_atomics_reduce =
+    " __kernel void local_atomics_reduce( __global uint *Histogram, uint "
+    "nSubHists )\n"
+    "{\n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint bin = 0;\n"
+    "    // Reduce work-group histograms into single histogram,\n"
+    "    // one thread for each bin.\n"
+    "    for( int i=0; i < nSubHists; i++ )\n"
+    "        bin += Histogram[ (i * NBINS) + tid ];\n"
+    "    Histogram[ tid ] = bin;\n"
+    "}\n";
+
+// OpenCL C source for a histogram built directly in __global memory.
+// Each work-item reads ItemsPerThread uint words (stepping by the global
+// size) and, for each of the four bytes of a word, atom_inc's the bin
+// Histogram[byte & (NBINS-1)]. NBINS and BITS_PER_PIX are not defined in
+// the string itself.
+static const char *global_atomics_histogram =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
+    "void global_atomics_histogram(uint ItemsPerThread,\n"
+    "__global uint *Input,\n"
+    "__global uint  *Histogram)\n"
+    "{\n"
+    "   uint tid = get_global_id(0);\n"
+    "   const uint shft = (uint) BITS_PER_PIX;\n"
+    "   const uint msk =  (uint) (NBINS-1);\n"
+    "   uint Stride  = get_global_size(0);\n"
+    "   for( int i = 0; i < ItemsPerThread; i++)\n"
+    "   {\n"
+    "       uint temp  = Input[tid];\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       tid += Stride;"
+    "   }\n"
+    "}\n";
+
+// uint4 variant of the global-memory histogram: every iteration loads one
+// uint4 and issues an atom_inc for each byte of each of its four
+// components (16 increments per load).
+// Note that the kernel function inside the string is still named
+// global_atomics_histogram, the same name as in the scalar variant.
+static const char *global_vec4_atomics_histogram =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
+    "void global_atomics_histogram(uint ItemsPerThread,\n"
+    "__global uint4 *Input,\n"
+    "__global uint  *Histogram)\n"
+    "{\n"
+    "   uint tid = get_global_id(0);\n"
+    "   const uint shft = (uint) BITS_PER_PIX;\n"
+    "   const uint msk =  (uint) (NBINS-1);\n"
+    "   uint Stride  = get_global_size(0);\n"
+    "   for( int i = 0; i < ItemsPerThread; i++)\n"
+    "   {\n"
+    "       uint4 temp  = Input[tid];\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       tid += Stride;"
+    "   }\n"
+    "}\n";
+
+// OpenCL C source using the atom_add built-in.
+// Each work-item walks ItemsPerThread input words, stepping by the global
+// size, sums the low two bits of every byte of each word, and finally
+// atom_add's its sum into Output[0], so every work-item contends on the
+// same location.
+static const char *global_atomics_sum_reduction_all_to_zero =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_all_to_zero(uint "
+    "ItemsPerThread, __global uint *Input, __global int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"
+    "    const uint msk =  (uint)3;\n"
+    "    const uint shft = (uint)8;\n"
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"
+    "    }\n"
+    "    atom_add( &(Output[0]), sum);\n"
+    "}\n";
+
+// OpenCL C source using the atom_add built-in.
+// Same per-work-item summation as the all-to-zero kernel, but the sum is
+// atom_add'ed into Output[get_group_id(0)], so only work-items of the same
+// work-group contend on one location.
+static const char *global_atomics_sum_reduction_workgroup =
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_workgroup(uint "
+    "ItemsPerThread, __global uint *Input, __global int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"
+    "    const uint msk =  (uint)3;\n"
+    "    const uint shft = (uint)8;\n"
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"
+    "    }\n"
+    "    atom_add( &(Output[get_group_id(0)]), sum);\n"
+    "}\n";
+
+// OpenCL C source for a sum reduction in __local memory without atomics.
+// Each work-item loads input[2 * gid] and input[2 * gid + 1], sums the low
+// two bits of every byte of both words and stores that in sdata[tid]. The
+// work-group then repeatedly halves the active range, adding
+// sdata[tid + s] into sdata[tid] with a barrier after every step, and
+// work-item 0 writes sdata[0] to output[get_group_id(0)].
+static const char *local_reduction =
+    "__kernel void local_reduction(__global uint* input, __global uint* "
+    "output, __local uint* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"
+    "   unsigned int data1 = input[stride];\n"
+    "   unsigned int data2 = input[stride + 1];\n"
+    "   unsigned int sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           sdata[tid] += sdata[tid + s];\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
+
+// uint4 variant of the non-atomic __local sum reduction: the loads, the
+// running sum, the __local scratch array and the per-group output are all
+// uint4, so four independent sums are reduced side by side.
+// Note that the kernel function inside the string is named local_reduction,
+// the same name as in the scalar variant.
+static const char *local_vec4_reduction =
+    "__kernel void local_reduction(__global uint4* input, __global uint4* "
+    "output, __local uint4* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"
+    "   uint4 data1 = input[stride];\n"
+    "   uint4 data2 = input[stride + 1];\n"
+    "   uint4 sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           sdata[tid] += sdata[tid + s];\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
+
+// Atomic variant of the scalar __local sum reduction: identical to the
+// non-atomic kernel except that each halving step combines the two slots
+// with atom_add on sdata[tid] instead of a plain +=.
+// Note that the kernel function inside the string is named local_reduction.
+static const char *local_atomics_reduction =
+    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+    "__kernel void local_reduction(__global uint* input, __global uint* "
+    "output, __local uint* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"
+    "   unsigned int data1 = input[stride];\n"
+    "   unsigned int data2 = input[stride + 1];\n"
+    "   unsigned int sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           atom_add( &(sdata[tid]), sdata[tid + s]);\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
+
+// uint4 + atomics variant of the __local sum reduction: data is handled as
+// uint4 and every halving step issues one atom_add per component
+// (.x, .y, .z, .w) of sdata[tid].
+// Note that the kernel function inside the string is named local_reduction.
+static const char *local_vec4_atomics_reduction =
+    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+    "__kernel void local_reduction(__global uint4* input, __global uint4* "
+    "output, __local uint4* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"
+    "   uint4 data1 = input[stride];\n"
+    "   uint4 data2 = input[stride + 1];\n"
+    "   uint4 sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           atom_add( &(sdata[tid]).x, sdata[tid + s].x);\n"
+    "           atom_add( &(sdata[tid]).y, sdata[tid + s].y);\n"
+    "           atom_add( &(sdata[tid]).z, sdata[tid + s].z);\n"
+    "           atom_add( &(sdata[tid]).w, sdata[tid + s].w);\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
@@ -0,0 +1,254 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferCopyOverhead.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// One benchmark configuration for the copy-overhead test.
+typedef struct {
+  // Number of timed clEnqueueCopyBuffer calls issued by run().
+  unsigned int iterations;
+  // run() waits on the queue after every flushEvery-th copy; values <= 0
+  // disable the intermediate waits.
+  int flushEvery;
+} testStruct;
+
+// Table of {iterations, flushEvery} configurations. open() selects an entry
+// with (test % entry count), so the whole table is reused for every wait-mode
+// and buffer-placement variant.
+// NOTE(review): the first two entries are both {1, -1} - confirm whether the
+// duplicate is intentional.
+static testStruct testList[] = {
+    {1, -1},         {1, -1},      {10, 1},      {10, -1},      {100, 1},
+    {100, 10},       {100, -1},    {1000, 1},    {1000, 10},    {1000, 100},
+    {1000, -1},      {10000, 1},   {10000, 10},  {10000, 100},  {10000, 1000},
+    {10000, -1},     {100000, 1},  {100000, 10}, {100000, 100}, {100000, 1000},
+    {100000, 10000}, {100000, -1},
+};
+
+// Registers the number of sub-tests: each testList entry is run in four
+// variants (two wait modes times two buffer placements, decoded in open()).
+OCLPerfBufferCopyOverhead::OCLPerfBufferCopyOverhead() {
+  const size_t numConfigs = sizeof(testList) / sizeof(testList[0]);
+  _numSubTests = 4 * numConfigs;
+}
+
+// Nothing to do here: the OpenCL objects are released in close().
+OCLPerfBufferCopyOverhead::~OCLPerfBufferCopyOverhead() {}
+
+// Error-notification callback handed to clCreateContext. The body is empty, so
+// any information reported by the runtime is discarded.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares one sub-test: selects the platform and device, creates the context
+// and command queue, and allocates the 4-byte source and destination buffers.
+//
+// test       - sub-test index. (test % table size) picks the testList entry,
+//              the next factor of two selects the "sleep" (clFinish) wait mode
+//              over spinning, and indices in the upper half of the range place
+//              the source buffer in CL_MEM_ALLOC_HOST_PTR memory (otherwise
+//              the destination buffer gets that flag).
+// units      - not used by this test.
+// conversion - set to 1.0.
+// deviceId   - index of the device to use on the selected platform.
+//
+// Failures are reported through CHECK_RESULT. That macro is defined elsewhere;
+// it is assumed to leave the function on failure (close() uses the separate
+// CHECK_RESULT_NO_RETURN variant), which is why temporary arrays are released
+// before the checks that follow them.
+void OCLPerfBufferCopyOverhead::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+  const size_t numConfigs = sizeof(testList) / sizeof(testStruct);
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test % numConfigs;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Do not leak the array on the error exit taken by the check below.
+      delete[] platforms;
+    }
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // NOTE(review): _platformIndex is not range-checked against numPlatforms.
+    platform = platforms[_platformIndex];
+    // The vendor string is queried but not used by this test.
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so the result is deliberately left unchecked.
+    // The array came from new[], so it must be released with delete[].
+    delete[] platforms;
+  }
+
+  bufSize_ = 4;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free the list before the
+  // checks so that it is released on the failure paths as well.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Decode the wait mode and which side of the copy lives in host memory.
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  sleep = ((test / numConfigs) % 2) > 0;
+  if (test >= (numConfigs * 2)) {
+    srcHost = true;
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else {
+    srcHost = false;
+  }
+  srcBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_);
+  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+
+  flags = CL_MEM_WRITE_ONLY;
+  if (!srcHost) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  }
+  dstBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_);
+  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+}
+
+// Times testList[_openTest].iterations copies of the small buffer and stores
+// the average time per copy, in microseconds, in _perfInfo.
+//
+// When the entry's flushEvery is positive, the test waits after every
+// flushEvery-th copy; it always waits once more after the last copy. In
+// "sleep" mode the wait is clFinish; otherwise the queue is flushed and the
+// status of the most recent copy's event is polled until it is no longer
+// positive (complete or failed).
+//
+// CHECK_RESULT is assumed to leave the function on failure (it is defined
+// elsewhere), so the event is released before the checks added after polling.
+void OCLPerfBufferCopyOverhead::run(void) {
+  CPerfCounter timer;
+  cl_event event = NULL;
+  // Initialised so the polling loops never test an indeterminate value.
+  cl_int eventStatus = CL_COMPLETE;
+  unsigned int iter = testList[_openTest].iterations;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0,
+                                         0, bufSize_, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < iter; i++) {
+    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_,
+                                           0, 0, bufSize_, 0, NULL, &event);
+
+    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+    if ((testList[_openTest].flushEvery > 0) &&
+        (((i + 1) % testList[_openTest].flushEvery) == 0)) {
+      if (sleep) {
+        _wrapper->clFinish(cmd_queue_);
+      } else {
+        _wrapper->clFlush(cmd_queue_);
+        // Poll at least once; stop as soon as the query itself fails instead
+        // of spinning on a status value that is no longer being updated.
+        do {
+          error_ =
+              _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                       sizeof(cl_int), &eventStatus, NULL);
+        } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+        if (error_ != CL_SUCCESS) {
+          _wrapper->clReleaseEvent(event);
+        }
+        CHECK_RESULT(error_ != CL_SUCCESS, "clGetEventInfo failed");
+      }
+    }
+    // The last event is kept for the final wait after the loop.
+    if (i != (iter - 1)) {
+      _wrapper->clReleaseEvent(event);
+    }
+  }
+  if (sleep) {
+    _wrapper->clFinish(cmd_queue_);
+  } else {
+    _wrapper->clFlush(cmd_queue_);
+    if (event != NULL) {
+      do {
+        error_ =
+            _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                     sizeof(cl_int), &eventStatus, NULL);
+      } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+      if (error_ != CL_SUCCESS) {
+        _wrapper->clReleaseEvent(event);
+      }
+      CHECK_RESULT(error_ != CL_SUCCESS, "clGetEventInfo failed");
+    }
+  }
+  if (event != NULL) {
+    _wrapper->clReleaseEvent(event);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer copy time in us
+  double perf = sec * 1000. * 1000. / iter;
+
+  const char *strSrc = NULL;
+  const char *strDst = NULL;
+  const char *strWait = NULL;
+  if (srcHost) {
+    strSrc = "host";
+    strDst = "dev";
+  } else {
+    strSrc = "dev";
+    strDst = "host";
+  }
+  if (sleep) {
+    strWait = "sleep";
+  } else {
+    strWait = "spin";
+  }
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %5s, s:%4s d:%4s i:%6d (us) ", strWait, strSrc,
+           strDst, iter);
+  testDescString = buf;
+}
+
+// Releases whatever open() managed to create - both buffers, then the command
+// queue, then the context - and returns the test's CRC word. Release failures
+// go through CHECK_RESULT_NO_RETURN (defined elsewhere; presumably it records
+// the error without leaving the function, so the later releases still run).
+unsigned int OCLPerfBufferCopyOverhead::close(void) {
+  // Memory objects
+  if (srcBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+
+  // Queue
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  // Context
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferCopyOverhead_H_
+#define _OCL_BufferCopyOverhead_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures the average time per clEnqueueCopyBuffer call
+// on a very small buffer, across several iteration counts, wait intervals,
+// wait modes and buffer placements.
+class OCLPerfBufferCopyOverhead : public OCLTestImp {
+ public:
+  OCLPerfBufferCopyOverhead();
+  virtual ~OCLPerfBufferCopyOverhead();
+
+ public:
+  // Selects the device and creates the context, queue and buffers for the
+  // given sub-test index.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Runs the timed copies and records the result.
+  virtual void run(void);
+  // Releases the OpenCL objects created by open() and returns the CRC word.
+  virtual unsigned int close(void);
+
+  // NOTE(review): not referenced by this test's implementation as far as
+  // visible here - confirm whether it is still needed.
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;          // context created in open()
+  cl_command_queue cmd_queue_;  // queue the copies are enqueued on
+  cl_mem srcBuffer_;            // copy source
+  cl_mem dstBuffer_;            // copy destination
+  cl_int error_;                // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // size in bytes of each buffer
+  bool sleep;    // true: wait with clFinish; false: flush and poll the event
+  bool srcHost;  // true: source buffer uses CL_MEM_ALLOC_HOST_PTR, else the
+                 // destination buffer does
+};
+
+#endif  // _OCL_BufferCopyOverhead_H_
@@ -0,0 +1,439 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferCopySpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Number of buffer sizes exercised by the test.
+#define NUM_SIZES 8
+// Buffer sizes in bytes:
+// 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10
+static const unsigned int Sizes[NUM_SIZES] = {
+    4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216 + 10};
+
+// Timed copy counts: the first half of the sub-tests uses a single copy, the
+// second half uses NUM_ITER copies.
+static const unsigned int Iterations[2] = {1, OCLPerfBufferCopySpeed::NUM_ITER};
+
+// Buffer placements decoded in open(): default, persistent (AMD platforms
+// only), CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR.
+#define BUF_TYPES 4
+//  16 ways to combine 4 different buffer types
+#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES)
+
+// Registers the number of sub-tests: every buffer size is combined with every
+// source/destination placement pair and with each entry of Iterations.
+OCLPerfBufferCopySpeed::OCLPerfBufferCopySpeed() {
+  const unsigned int numIterationModes =
+      sizeof(Iterations) / sizeof(Iterations[0]);
+  _numSubTests = numIterationModes * NUM_SIZES * NUM_SUBTESTS;
+}
+
+// Nothing to do here: OpenCL objects and host allocations are released in
+// close().
+OCLPerfBufferCopySpeed::~OCLPerfBufferCopySpeed() {}
+
+// Error-notification callback handed to clCreateContext. The body is empty, so
+// any information reported by the runtime is discarded.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Fills the first size / 4 32-bit words at `ptr` with the incrementing
+// sequence 0, 1, 2, ... The `value` argument does not influence the data: the
+// sequence always starts at zero (checkData() validates the same sequence).
+void OCLPerfBufferCopySpeed::setData(void *ptr, unsigned int size,
+                                     unsigned int value) {
+  (void)value;
+  unsigned int *words = static_cast<unsigned int *>(ptr);
+  const unsigned int numWords = size >> 2;
+  for (unsigned int idx = 0; idx < numWords; ++idx) {
+    words[idx] = idx;
+  }
+}
+
+// Verifies that the first size / 4 32-bit words at `ptr` hold the incrementing
+// sequence 0, 1, 2, ... written by setData(). As in setData(), the `value`
+// argument is reset to zero before use.
+//
+// On the first mismatch it prints up to four words starting at the failing
+// index together with the values expected there, reports the failure through
+// CHECK_RESULT and stops checking.
+void OCLPerfBufferCopySpeed::checkData(void *ptr, unsigned int size,
+                                       unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  const unsigned int numWords = size >> 2;
+  value = 0;
+  for (unsigned int i = 0; i < numWords; i++) {
+    if (ptr2[i] != value) {
+      // Clamp the dump to the end of the buffer so a mismatch in the last
+      // three words does not read past the mapped region.
+      const unsigned int remaining = numWords - i;
+      const unsigned int shown = (remaining < 4) ? remaining : 4;
+      printf("Data validation failed at %u!  Got", i);
+      for (unsigned int k = 0; k < shown; k++) {
+        printf(" 0x%08x", ptr2[i + k]);
+      }
+      printf("\n");
+      // The expected data increments by one per word.
+      printf("Expected");
+      for (unsigned int k = 0; k < shown; k++) {
+        printf(" 0x%08x", value + k);
+      }
+      printf("\n");
+      CHECK_RESULT(true, "Data validation failed!");
+      break;
+    }
+    value++;
+  }
+}
+
+// Prepares one sub-test: selects the platform and device, creates the context
+// and command queue, allocates the source and destination buffers with the
+// placement encoded in the sub-test index, and fills the source buffer.
+//
+// test       - sub-test index. (test % NUM_SIZES) selects the buffer size, the
+//              next two base-BUF_TYPES digits select the source and the
+//              destination placement, and the remaining quotient selects the
+//              entry of Iterations.
+// units      - not used by this test.
+// conversion - set to 1.0.
+// deviceId   - index of the device to use on the selected platform.
+//
+// Placement codes - source: 1 = ALLOC_HOST_PTR, 2 = persistent (AMD only),
+// 3 = USE_HOST_PTR; destination: 1 = persistent (AMD only),
+// 2 = ALLOC_HOST_PTR, 3 = USE_HOST_PTR. Anything else uses default flags.
+//
+// Failures are reported through CHECK_RESULT. That macro is defined elsewhere;
+// it is assumed to leave the function on failure (close() uses the separate
+// CHECK_RESULT_NO_RETURN variant), which is why temporary arrays are released
+// before the checks that follow them.
+void OCLPerfBufferCopySpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  persistent[0] = false;
+  persistent[1] = false;
+  allocHostPtr[0] = false;
+  allocHostPtr[1] = false;
+  useHostPtr[0] = false;
+  useHostPtr[1] = false;
+  memptr[0] = NULL;
+  memptr[1] = NULL;
+  alignedmemptr[0] = NULL;
+  alignedmemptr[1] = NULL;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Do not leak the array on the error exit taken by the check below.
+      delete[] platforms;
+    }
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // NOTE(review): _platformIndex is not range-checked against numPlatforms.
+    platform = platforms[_platformIndex];
+    // Zero-filled so the vendor comparison below is well defined even when the
+    // query fails and leaves the buffer untouched.
+    char pbuf[100] = {0};
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so the result is deliberately left unchecked.
+    if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+    // The array came from new[], so it must be released with delete[].
+    delete[] platforms;
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions. Checked before the platform handle is
+   * passed to any further query.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  // Keep the three characters at offsets 7..9 of the version string, e.g. the
+  // "1.2" of "OpenCL 1.2 ...".
+  char getVersion[128] = {0};
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+
+  // Decode size, buffer placements and iteration count from the index.
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  unsigned int srcTest = (_openTest / NUM_SIZES) % BUF_TYPES;
+  unsigned int dstTest = (_openTest / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES;
+  if (srcTest == 3) {
+    useHostPtr[0] = true;
+  } else if ((srcTest == 2) && isAMD) {
+    persistent[0] = true;
+  } else if (srcTest == 1) {
+    allocHostPtr[0] = true;
+  }
+  if ((dstTest == 1) && isAMD) {
+    persistent[1] = true;
+  } else if (dstTest == 2) {
+    allocHostPtr[1] = true;
+  } else if (dstTest == 3) {
+    useHostPtr[1] = true;
+  }
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free the list before the
+  // checks so that it is released on the failure paths as well.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Source buffer. For USE_HOST_PTR the backing store is over-allocated by
+  // 4096 bytes and the pointer rounded up to a 4096-byte boundary; memptr keeps
+  // the original pointer for free() in close().
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (persistent[0]) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr[0]) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr[0]) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    memptr[0] = malloc(bufSize_ + 4096);
+    alignedmemptr[0] = (void *)(((size_t)memptr[0] + 4095) & ~(size_t)4095);
+  }
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_,
+                                        alignedmemptr[0], &error_);
+  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+
+  // Fill the source with the reference pattern through a blocking map.
+  void *mem;
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, srcBuffer_, CL_TRUE,
+                                     CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+                                     &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  setData(mem, bufSize_, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL);
+
+  // Destination buffer, same placement handling as the source.
+  flags = CL_MEM_WRITE_ONLY;
+  if (persistent[1]) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr[1]) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr[1]) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    memptr[1] = malloc(bufSize_ + 4096);
+    alignedmemptr[1] = (void *)(((size_t)memptr[1] + 4095) & ~(size_t)4095);
+  }
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_,
+                                        alignedmemptr[1], &error_);
+  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+
+  // Force persistent memory to be on GPU
+  // NOTE(review): the persistent-source case touches dstBuffer_ and the
+  // persistent-destination case touches srcBuffer_; confirm the pairing is
+  // intended. It is kept exactly as before.
+  if (persistent[0]) {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, dstBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+  if (persistent[1]) {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, memBuffer, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Measures buffer-to-buffer copy bandwidth: one untimed warm-up copy, then
+// numIter timed clEnqueueCopyBuffer calls followed by a single clFinish. The
+// destination is validated afterwards and the result, in GB/s, is stored in
+// _perfInfo together with a description string.
+void OCLPerfBufferCopySpeed::run(void) {
+  CPerfCounter timer;
+
+  // Untimed warm-up copy
+  error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0,
+                                         0, bufSize_, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed section
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass != numIter; ++pass) {
+    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_,
+                                           0, 0, bufSize_, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+
+  const double sec = timer.GetElapsedTime();
+
+  // Buffer copy bandwidth in GB/s
+  const double bytesMoved = (double)bufSize_ * numIter;
+  double perf = (bytesMoved * (double)(1e-09)) / sec;
+
+  // Validate what arrived in the destination buffer.
+  void *mapped = _wrapper->clEnqueueMapBuffer(cmd_queue_, dstBuffer_, CL_TRUE,
+                                              CL_MAP_READ, 0, bufSize_, 0,
+                                              NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  checkData(mapped, bufSize_, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mapped, 0, NULL,
+                                    NULL);
+
+  // Short labels for the source / destination placement.
+  const char *strSrc = persistent[0]
+                           ? "per"
+                           : (allocHostPtr[0] ? "AHP"
+                                              : (useHostPtr[0] ? "UHP" : "dev"));
+  const char *strDst = persistent[1]
+                           ? "per"
+                           : (allocHostPtr[1] ? "AHP"
+                                              : (useHostPtr[1] ? "UHP" : "dev"));
+
+  const bool srcInSysmem = allocHostPtr[0] || useHostPtr[0];
+  const bool dstInSysmem = allocHostPtr[1] || useHostPtr[1];
+  const bool srcOnDevice = persistent[0] || !srcInSysmem;
+  const bool dstOnDevice = persistent[1] || !dstInSysmem;
+
+  // Double results when src and dst are both on device
+  if (srcOnDevice && dstOnDevice) {
+    perf *= 2.0;
+  }
+  // Double results when src and dst are both in sysmem
+  if (srcInSysmem && dstInSysmem) {
+    perf *= 2.0;
+  }
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_,
+           strSrc, strDst, numIter);
+  testDescString = buf;
+}
+
+// Releases whatever open() managed to create - both buffers, the host memory
+// backing any USE_HOST_PTR buffer, the command queue and the context - and
+// returns the test's CRC word. Release failures go through
+// CHECK_RESULT_NO_RETURN (defined elsewhere; presumably it records the error
+// without leaving the function, so the later releases still run).
+unsigned int OCLPerfBufferCopySpeed::close(void) {
+  // Memory objects
+  if (srcBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+
+  // Host allocations, freed only after the buffers that used them are gone
+  for (int slot = 0; slot < 2; ++slot) {
+    if (memptr[slot] != NULL) {
+      free(memptr[slot]);
+    }
+  }
+
+  // Queue
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  // Context
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+// Rectangular-copy variant of the bandwidth test. Copies a width x width byte
+// region, where width is the truncated square root of bufSize_, using
+// clEnqueueCopyBufferRect: one untimed warm-up followed by the timed copies
+// and a single clFinish. The test reports " SKIPPED " when the stored platform
+// version reads as 1.0. The bandwidth figure is computed from bufSize_.
+void OCLPerfBufferCopyRectSpeed::run(void) {
+  CPerfCounter timer;
+  const size_t side = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t srcOrigin[3] = {0, 0, 0};
+  size_t dstOrigin[3] = {0, 0, 0};
+  size_t region[3] = {side, side, 1};
+
+  // Clamp iteration count for non-local writes to shorten test runtime
+  unsigned int testNumIter = numIter;
+  if (allocHostPtr[1] && (testNumIter > 100)) {
+    testNumIter = 100;
+  }
+
+  // Skip for 1.0 platforms
+  const bool isVersion10 =
+      (platformVersion[0] == '1') && (platformVersion[2] == '0');
+  if (isVersion10) {
+    char buf[256];
+    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+    testDescString = buf;
+    return;
+  }
+
+  // Untimed warm-up copy
+  error_ = _wrapper->clEnqueueCopyBufferRect(cmd_queue_, srcBuffer_, dstBuffer_,
+                                             srcOrigin, dstOrigin, region, side,
+                                             0, side, 0, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed section
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass != testNumIter; ++pass) {
+    error_ = _wrapper->clEnqueueCopyBufferRect(
+        cmd_queue_, srcBuffer_, dstBuffer_, srcOrigin, dstOrigin, region, side,
+        0, side, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+
+  const double sec = timer.GetElapsedTime();
+
+  // Buffer copy bandwidth in GB/s
+  const double bytesMoved = (double)bufSize_ * testNumIter;
+  double perf = (bytesMoved * (double)(1e-09)) / sec;
+
+  // Short labels for the source / destination placement.
+  const char *strSrc = persistent[0]
+                           ? "per"
+                           : (allocHostPtr[0] ? "AHP"
+                                              : (useHostPtr[0] ? "UHP" : "dev"));
+  const char *strDst = persistent[1]
+                           ? "per"
+                           : (allocHostPtr[1] ? "AHP"
+                                              : (useHostPtr[1] ? "UHP" : "dev"));
+
+  const bool srcInSysmem = allocHostPtr[0] || useHostPtr[0];
+  const bool dstInSysmem = allocHostPtr[1] || useHostPtr[1];
+  const bool srcOnDevice = persistent[0] || !srcInSysmem;
+  const bool dstOnDevice = persistent[1] || !dstInSysmem;
+
+  // Double results when src and dst are both on device
+  if (srcOnDevice && dstOnDevice) {
+    perf *= 2.0;
+  }
+  // Double results when src and dst are both in sysmem
+  if (srcInSysmem && dstInSysmem) {
+    perf *= 2.0;
+  }
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_,
+           strSrc, strDst, testNumIter);
+  testDescString = buf;
+}
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferCopySpeed_H_
+#define _OCL_BufferCopySpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures clEnqueueCopyBuffer bandwidth for a range of
+// buffer sizes and source/destination buffer placements.
+class OCLPerfBufferCopySpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferCopySpeed();
+  virtual ~OCLPerfBufferCopySpeed();
+
+ public:
+  // Selects the device, creates the context, queue and buffers for the given
+  // sub-test index, and fills the source buffer.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Runs the timed copies, validates the destination and records the result.
+  virtual void run(void);
+  // Releases everything created by open() and returns the CRC word.
+  virtual unsigned int close(void);
+
+  // Copy count used by the multi-iteration sub-tests.
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;          // context created in open()
+  cl_command_queue cmd_queue_;  // queue the copies are enqueued on
+  cl_mem srcBuffer_;            // copy source
+  cl_mem dstBuffer_;            // copy destination
+  cl_int error_;                // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // size in bytes of each buffer
+  // Placement flags; index 0 describes the source, index 1 the destination.
+  bool persistent[2];    // created with CL_MEM_USE_PERSISTENT_MEM_AMD
+  bool allocHostPtr[2];  // created with CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr[2];    // created with CL_MEM_USE_HOST_PTR
+  unsigned int numIter;  // number of timed copies for this sub-test
+  // Set in open() when the selected platform reports devices and its vendor
+  // string is "Advanced Micro Devices, Inc.".
+  bool isAMD;
+  // Three characters taken from offsets 7..9 of the CL_PLATFORM_VERSION
+  // string, NUL-terminated.
+  char platformVersion[32];
+  // Writes the incrementing reference pattern into size / 4 words at ptr.
+  void setData(void* ptr, unsigned int size, unsigned int value);
+  // Checks size / 4 words at ptr against the reference pattern.
+  void checkData(void* ptr, unsigned int size, unsigned int value);
+  void* memptr[2];         // raw malloc results backing USE_HOST_PTR buffers
+  void* alignedmemptr[2];  // memptr rounded up to a 4096-byte boundary
+};
+
+// Variant of OCLPerfBufferCopySpeed that inherits open()/close() unchanged and
+// overrides run() to time clEnqueueCopyBufferRect instead.
+class OCLPerfBufferCopyRectSpeed : public OCLPerfBufferCopySpeed {
+ public:
+  OCLPerfBufferCopyRectSpeed() : OCLPerfBufferCopySpeed() {}
+
+ public:
+  // Times rectangular copies of a square region.
+  virtual void run(void);
+};
+#endif  // _OCL_BufferCopySpeed_H_
@@ -0,0 +1,334 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8  // number of entries in Sizes[]
+// Transfer sizes in bytes: 1 KB, 32 KB, 64 KB, 128 KB, 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216};
+
+static cl_uint blockedSubtests;  // set in the constructor; sub-tests below this index use blocking reads
+
+static const unsigned int Iterations[2] = {1, OCLPerfBufferReadSpeed::NUM_ITER};  // iteration counts for the two blocking passes
+#define NUM_OFFSETS 1  // number of entries in offsets[]
+static const unsigned int offsets[NUM_OFFSETS] = {0};  // byte offsets added to the aligned host pointer in USE_HOST_PTR mode
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)  // allocation modes: default, ALLOC_HOST_PTR, persistent, plus one USE_HOST_PTR mode per offset
+extern const char *blkStr[2];  // defined elsewhere; indexed by the blocking flag when building the description
+
+// Registers the sub-test count. One pass covers every (size, allocation mode)
+// pair; the first two passes use blocking reads and a third pass uses
+// non-blocking reads. blockedSubtests marks the boundary between them.
+OCLPerfBufferReadSpeed::OCLPerfBufferReadSpeed() {
+  const unsigned int testsPerPass = NUM_SIZES * NUM_SUBTESTS;
+  blockedSubtests = 2 * testsPerPass;
+  _numSubTests = 3 * testsPerPass;
+}
+
+OCLPerfBufferReadSpeed::~OCLPerfBufferReadSpeed() {}  // empty: the CL objects and hostMem are released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op: passed to clCreateContext in open(); all arguments are ignored
+
+// Prepares one sub-test: decodes `test` into a buffer size, an allocation
+// mode and an iteration count, then creates the context, the command queue
+// and the buffer that run() reads back.
+//
+// Sub-test index layout:
+//   test % NUM_SIZES                   -> entry of Sizes[]
+//   (test / NUM_SIZES) % NUM_SUBTESTS  -> 0: default buffer
+//                                         1: CL_MEM_ALLOC_HOST_PTR
+//                                         2: persistent memory (only when the
+//                                            platform vendor is AMD, otherwise
+//                                            a default buffer)
+//                                         3+: CL_MEM_USE_HOST_PTR with
+//                                             offsets[mode - 3]
+//   test < blockedSubtests             -> iteration count from Iterations[];
+//                                         otherwise a count scaled down with
+//                                         the size index
+//
+// `units` is not touched and `conversion` is set to 1.0. Failures are
+// reported through CHECK_RESULT.
+void OCLPerfBufferReadSpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;  // only meaningful in USE_HOST_PTR mode; never left indeterminate
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    const bool gotPlatforms = (error_ == CL_SUCCESS);
+    // An out-of-range _platformIndex leaves `platform` NULL, which the check
+    // after this block reports, instead of indexing past the array.
+    if (gotPlatforms &&
+        (static_cast<cl_uint>(_platformIndex) < numPlatforms)) {
+      platform = platforms[_platformIndex];
+      char pbuf[100];
+      pbuf[0] = '\0';  // keeps the strcmp below defined if the query fails
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so error_ is deliberately not checked here.
+      if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+    // Array form to match the new[] above. Released before CHECK_RESULT so
+    // the array is not leaked when the check fails.
+    delete[] platforms;
+    CHECK_RESULT(!gotPlatforms, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Characters 7..9 of the version string are kept; the rect tests compare
+  // them against '1' and '0' to detect a 1.0 platform.
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  const unsigned int mode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (mode > 2) {
+    useHostPtr = true;
+    offset = offsets[mode - 3];
+  } else if ((mode == 2) && isAMD) {
+    persistent = true;
+  } else if (mode == 1) {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter =
+        4 * OCLPerfBufferReadSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Copy the requested handle out and free the array before the checks
+  // below, so the array is not leaked on any path.
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to `alignment` and then
+    // shifted by `offset`; hostMem keeps the raw pointer for free() in close().
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter clEnqueueReadBuffer calls of bufSize_ bytes after one blocking
+// warm-up read. The result in GB/s goes to _perfInfo and a description of the
+// configuration goes to testDescString. Sub-tests below blockedSubtests use
+// blocking reads; the others enqueue non-blocking reads and wait with
+// clFinish before the timer stops.
+void OCLPerfBufferReadSpeed::run(void) {
+  CPerfCounter timer;
+
+  // Owns the host destination buffer so it is released with delete[] on every
+  // exit path, including an early exit taken inside CHECK_RESULT.
+  struct HostBuffer {
+    char *data;
+    explicit HostBuffer(size_t bytes) : data(new char[bytes]) {}
+    ~HostBuffer() { delete[] data; }
+  };
+  HostBuffer hostBuffer(bufSize_);
+  char *mem = hostBuffer.data;
+
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
+                                         bufSize_, mem, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, blocking, 0,
+                                           bufSize_, mem, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
+
+// Tears down what open() created, in the order buffer, queue, context, and
+// then frees the USE_HOST_PTR backing store. Release failures are reported
+// without aborting the remaining cleanup. Returns _crcword.
+unsigned int OCLPerfBufferReadSpeed::close() {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // free() accepts a null pointer, so no guard is required.
+  free(hostMem);
+
+  return _crcword;
+}
+
+// Rectangular-read variant of the bandwidth test. Reads a width x width x 1
+// byte region, with width = floor(sqrt(bufSize_)), using
+// clEnqueueReadBufferRect. The iteration count is clamped to 100 to reduce
+// run time. Platforms whose stored version reads 1.x0... are reported as
+// SKIPPED.
+//
+// NOTE(review): bandwidth is computed from bufSize_ although only
+// width * width bytes are transferred, and the description prints numIter
+// although testNumIter iterations are executed. Both are kept as-is so the
+// reported output does not change -- confirm whether that is intended.
+void OCLPerfBufferReadRectSpeed::run(void) {
+  CPerfCounter timer;
+
+  // Skip for 1.0 platforms. Checked before the host buffer is allocated so
+  // the skip path has nothing to release.
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    char buf[256];
+    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+    testDescString = buf;
+    return;
+  }
+
+  // Owns the host destination buffer so it is released with delete[] on every
+  // exit path, including an early exit taken inside CHECK_RESULT.
+  struct HostBuffer {
+    char *data;
+    explicit HostBuffer(size_t bytes) : data(new char[bytes]) {}
+    ~HostBuffer() { delete[] data; }
+  };
+  HostBuffer hostBuffer(bufSize_);
+  char *mem = hostBuffer.data;
+
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Clamp iterations to reduce run time
+  unsigned int testNumIter;
+  testNumIter = (numIter < 100 ? numIter : 100);
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBufferRect(
+      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testNumIter; i++) {
+    error_ = _wrapper->clEnqueueReadBufferRect(
+        cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width,
+        0, width, 0, mem, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferReadSpeed_H_
+#define _OCL_BufferReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Measures host read-back bandwidth of an OpenCL buffer for several buffer
+// sizes and allocation modes. open() builds the configuration for one
+// sub-test, run() times the reads and close() releases everything.
+class OCLPerfBufferReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferReadSpeed();
+  virtual ~OCLPerfBufferReadSpeed();
+
+  // Test lifecycle entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+  // Iteration count used by the second blocking pass; also the base of the
+  // non-blocking iteration count.
+  static const unsigned int NUM_ITER = 1000;
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // the buffer that run() reads from
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // transfer size in bytes
+  // Allocation mode of outBuffer_; at most one of the three is set.
+  bool persistent;
+  bool allocHostPtr;
+  bool useHostPtr;
+  unsigned int numIter;  // timed iterations for the current sub-test
+  char* hostMem;         // raw malloc result backing USE_HOST_PTR buffers
+  char* alignedMem;      // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;
+  unsigned int offset;
+  bool isAMD;  // platform vendor string matched AMD and devices were found
+  char platformVersion[32];  // three characters taken from the version string
+};
+
+// Variant of OCLPerfBufferReadSpeed that replaces only run(); setup and
+// teardown are inherited unchanged.
+class OCLPerfBufferReadRectSpeed : public OCLPerfBufferReadSpeed {
+ public:
+  // The base-class default constructor runs implicitly.
+  OCLPerfBufferReadRectSpeed() {}
+
+  virtual void run();
+};
+
+#endif  // _OCL_BufferReadSpeed_H_
@@ -0,0 +1,333 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8  // number of entries in Sizes[]
+// Transfer sizes in bytes: 1 KB, 32 KB, 64 KB, 128 KB, 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216};
+
+static cl_uint blockedSubtests;  // set in the constructor; sub-tests below this index use blocking writes
+
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfBufferWriteSpeed::NUM_ITER};  // iteration counts for the two blocking passes
+
+#define NUM_OFFSETS 1  // number of entries in offsets[]
+static const unsigned int offsets[NUM_OFFSETS] = {0};  // byte offsets added to the aligned host pointer in USE_HOST_PTR mode
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)  // allocation modes: default, ALLOC_HOST_PTR, persistent, plus one USE_HOST_PTR mode per offset
+extern const char *blkStr[2];  // defined elsewhere; indexed by the blocking flag when building the description
+
+// Registers the sub-test count. One pass covers every (size, allocation mode)
+// pair; the first two passes use blocking writes and a third pass uses
+// non-blocking writes. blockedSubtests marks the boundary between them.
+OCLPerfBufferWriteSpeed::OCLPerfBufferWriteSpeed() {
+  const unsigned int testsPerPass = NUM_SIZES * NUM_SUBTESTS;
+  blockedSubtests = 2 * testsPerPass;
+  _numSubTests = 3 * testsPerPass;
+}
+
+OCLPerfBufferWriteSpeed::~OCLPerfBufferWriteSpeed() {}  // empty: the CL objects and hostMem are released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op: passed to clCreateContext in open(); all arguments are ignored
+
+// Prepares one sub-test: decodes `test` into a buffer size, an allocation
+// mode and an iteration count, then creates the context, the command queue
+// and the buffer that run() writes to.
+//
+// Sub-test index layout:
+//   test % NUM_SIZES                   -> entry of Sizes[]
+//   (test / NUM_SIZES) % NUM_SUBTESTS  -> 0: default buffer
+//                                         1: CL_MEM_ALLOC_HOST_PTR
+//                                         2: persistent memory (only when the
+//                                            platform vendor is AMD, otherwise
+//                                            a default buffer)
+//                                         3+: CL_MEM_USE_HOST_PTR with
+//                                             offsets[mode - 3]
+//   test < blockedSubtests             -> iteration count from Iterations[];
+//                                         otherwise a count scaled down with
+//                                         the size index
+//
+// `units` is not touched and `conversion` is set to 1.0. Failures are
+// reported through CHECK_RESULT.
+void OCLPerfBufferWriteSpeed::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;  // only meaningful in USE_HOST_PTR mode; never left indeterminate
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    const bool gotPlatforms = (error_ == CL_SUCCESS);
+    // An out-of-range _platformIndex leaves `platform` NULL, which the check
+    // after this block reports, instead of indexing past the array.
+    if (gotPlatforms &&
+        (static_cast<cl_uint>(_platformIndex) < numPlatforms)) {
+      platform = platforms[_platformIndex];
+      char pbuf[100];
+      pbuf[0] = '\0';  // keeps the strcmp below defined if the query fails
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so error_ is deliberately not checked here.
+      if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+    // Array form to match the new[] above. Released before CHECK_RESULT so
+    // the array is not leaked when the check fails.
+    delete[] platforms;
+    CHECK_RESULT(!gotPlatforms, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Characters 7..9 of the version string are kept; the rect tests compare
+  // them against '1' and '0' to detect a 1.0 platform.
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  const unsigned int mode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (mode > 2) {
+    useHostPtr = true;
+    offset = offsets[mode - 3];
+  } else if ((mode == 2) && isAMD) {
+    persistent = true;
+  } else if (mode == 1) {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter =
+        4 * OCLPerfBufferWriteSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Copy the requested handle out and free the array before the checks
+  // below, so the array is not leaked on any path.
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to `alignment` and then
+    // shifted by `offset`; hostMem keeps the raw pointer for free() in close().
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, outBuffer_, memBuffer, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter clEnqueueWriteBuffer calls of bufSize_ bytes after one
+// blocking warm-up write. The result in GB/s goes to _perfInfo and a
+// description of the configuration goes to testDescString. Sub-tests below
+// blockedSubtests use blocking writes; the others enqueue non-blocking writes
+// and wait with clFinish before the timer stops.
+void OCLPerfBufferWriteSpeed::run(void) {
+  CPerfCounter timer;
+
+  // Owns the host source buffer so it is released with delete[] on every exit
+  // path, including an early exit taken inside CHECK_RESULT. The buffer is
+  // zero-filled so the writes never read indeterminate memory.
+  struct HostBuffer {
+    char *data;
+    explicit HostBuffer(size_t bytes) : data(new char[bytes]()) {}
+    ~HostBuffer() { delete[] data; }
+  };
+  HostBuffer hostBuffer(bufSize_);
+  char *mem = hostBuffer.data;
+
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
+                                          bufSize_, mem, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, blocking, 0,
+                                            bufSize_, mem, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer write bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
+
+// Tears down what open() created, in the order buffer, queue, context, and
+// then frees the USE_HOST_PTR backing store. Release failures are reported
+// without aborting the remaining cleanup. Returns _crcword.
+unsigned int OCLPerfBufferWriteSpeed::close() {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // free() accepts a null pointer, so no guard is required.
+  free(hostMem);
+
+  return _crcword;
+}
+
+// Rectangular-write variant of the bandwidth test. Writes a
+// width x width x 1 byte region, with width = floor(sqrt(bufSize_)), using
+// clEnqueueWriteBufferRect. Platforms whose stored version reads 1.x0... are
+// reported as SKIPPED.
+//
+// NOTE(review): bandwidth is computed from bufSize_ although only
+// width * width bytes are transferred; kept as-is so reported numbers do not
+// change -- confirm whether that is intended.
+void OCLPerfBufferWriteRectSpeed::run(void) {
+  CPerfCounter timer;
+
+  // Skip for 1.0 platforms. Checked before the host buffer is allocated so
+  // the skip path has nothing to release.
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    char buf[256];
+    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+    testDescString = buf;
+    return;
+  }
+
+  // Owns the host source buffer so it is released with delete[] on every exit
+  // path, including an early exit taken inside CHECK_RESULT. The buffer is
+  // zero-filled so the writes never read indeterminate memory.
+  struct HostBuffer {
+    char *data;
+    explicit HostBuffer(size_t bytes) : data(new char[bytes]()) {}
+    ~HostBuffer() { delete[] data; }
+  };
+  HostBuffer hostBuffer(bufSize_);
+  char *mem = hostBuffer.data;
+
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueWriteBufferRect(
+      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    error_ = _wrapper->clEnqueueWriteBufferRect(
+        cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width,
+        0, width, 0, mem, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer write bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferWriteSpeed_H_
+#define _OCL_BufferWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Measures host-to-buffer write bandwidth of an OpenCL buffer for several
+// buffer sizes and allocation modes. open() builds the configuration for one
+// sub-test, run() times the writes and close() releases everything.
+class OCLPerfBufferWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferWriteSpeed();
+  virtual ~OCLPerfBufferWriteSpeed();
+
+  // Test lifecycle entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+  // Iteration count used by the second blocking pass; also the base of the
+  // non-blocking iteration count.
+  static const unsigned int NUM_ITER = 1000;
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // despite the name, the buffer that run() writes to
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // transfer size in bytes
+  // Allocation mode of outBuffer_; at most one of the three is set.
+  bool persistent;
+  bool allocHostPtr;
+  bool useHostPtr;
+  unsigned int numIter;  // timed iterations for the current sub-test
+  char* hostMem;         // raw malloc result backing USE_HOST_PTR buffers
+  char* alignedMem;      // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;
+  unsigned int offset;
+  bool isAMD;  // platform vendor string matched AMD and devices were found
+  char platformVersion[32];  // three characters taken from the version string
+};
+
+// Variant of OCLPerfBufferWriteSpeed that replaces only run(); setup and
+// teardown are inherited unchanged.
+class OCLPerfBufferWriteRectSpeed : public OCLPerfBufferWriteSpeed {
+ public:
+  // The base-class default constructor runs implicitly.
+  OCLPerfBufferWriteRectSpeed() {}
+
+  virtual void run();
+};
+
+#endif  // _OCL_BufferWriteSpeed_H_
@@ -0,0 +1,304 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfCPUMemSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s  // used when WIN_OS is defined
+#else
+#define SNPRINTF snprintf  // used on every other build
+#endif
+
+#define NUM_SIZES 4  // entries in Sizes[]; open() uses test % NUM_SIZES as the size index
+// 256KB, 1 MB, 4MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+
+#define ITER_COUNT 2  // number of iteration settings per (size, allocation mode) pair
+static const unsigned int Iterations[2] = {1, OCLPerfCPUMemSpeed::NUM_ITER};  // timed repetitions: a single pass or NUM_ITER passes
+#define NUM_OFFSETS 1  // entries in offsets[]
+static const unsigned int offsets[NUM_OFFSETS] = {0};  // byte offsets added to the aligned USE_HOST_PTR pointer
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)  // allocation modes: default, ALLOC_HOST_PTR, persistent, plus one USE_HOST_PTR mode per offset
+// Every (size, allocation mode, iteration count) combination is run once for
+// each of the three transfer kinds that open() selects from the sub-test
+// index: memcpy to device, memcpy from device and memset.
+OCLPerfCPUMemSpeed::OCLPerfCPUMemSpeed() {
+  const int combosPerKind = NUM_SIZES * NUM_SUBTESTS * ITER_COUNT;
+  _numSubTests = combosPerKind * 3;
+}
+
+// Nothing to do here: the OpenCL objects and host memory are released in
+// close().
+OCLPerfCPUMemSpeed::~OCLPerfCPUMemSpeed() {}
+
+// Callback passed to clCreateContext() in open(). It is deliberately empty,
+// so none of its arguments are named.
+static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
+                                        const void * /*private_info*/,
+                                        size_t /*cb*/, void * /*user_data*/) {
+}
+
+// Prepares one sub-test: decodes `test` into a buffer size, an allocation
+// mode, an iteration count and a transfer kind, then creates the context,
+// the command queue and the buffer that run() maps and times.
+//
+// Sub-test index layout:
+//   test % NUM_SIZES                          index into Sizes[]
+//   (test / NUM_SIZES) % NUM_SUBTESTS         0: default buffer
+//                                             1: CL_MEM_ALLOC_HOST_PTR
+//                                             2: persistent memory (AMD only)
+//                                             3+: CL_MEM_USE_HOST_PTR
+//   (test / (NUM_SIZES * NUM_SUBTESTS)) % 2   index into Iterations[]
+//   test / (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT)
+//                                             0: memcpy to device
+//                                             1: memcpy from device
+//                                             2: memset
+//
+// `units` is not used and `conversion` is always set to 1.0. Failures are
+// reported through CHECK_RESULT. Temporary arrays are released before each
+// CHECK_RESULT that follows them so that an early exit cannot leak them.
+void OCLPerfCPUMemSpeed::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;
+  testMemset = false;
+  isAMD = false;
+  gpuSrc = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    // Reject an out-of-range index before it is used to read the array.
+    CHECK_RESULT(_platformIndex >= numPlatforms,
+                 "Requested platform index not available");
+
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    // Allocated with new[], so it must be released with delete[]. Only the
+    // selected id is needed from here on.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    char pbuf[100];
+    pbuf[0] = '\0';  // keeps strcmp well defined if the query fails
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    if ((error_ == CL_SUCCESS) &&
+        !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+
+    /* Get the number of requested devices */
+    num_devices = 0;
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so only the count is checked here.
+    CHECK_RESULT(num_devices == 0, "No devices found, cannot proceed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  const unsigned int allocMode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  if (allocMode > 2) {
+    useHostPtr = true;
+    offset = offsets[allocMode - 3];
+  } else if ((allocMode == 2) && isAMD) {
+    persistent = true;
+  } else if (allocMode == 1) {
+    allocHostPtr = true;
+  }
+
+  numIter = Iterations[(_openTest / (NUM_SIZES * NUM_SUBTESTS)) % 2];
+  if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT * 2))
+    testMemset = true;
+  else if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT)) {
+    gpuSrc = true;
+    // Reads from device memory are slow, so cap the repetition count.
+    numIter = std::min(numIter, 10u);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // The id list is only needed to pick one entry; free it before the checks.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Access flags follow the copy direction that run() will time.
+  cl_mem_flags flags;
+  if (gpuSrc) {
+    flags = CL_MEM_WRITE_ONLY;
+    mapFlags = CL_MAP_READ;
+  } else {
+    flags = CL_MEM_READ_ONLY;
+    mapFlags = CL_MAP_WRITE;
+  }
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to `alignment` and then
+    // shifted by `offset`. close() frees hostMem.
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Measures host memcpy/memset throughput through a mapped pointer to
+// outBuffer_.
+//
+// After one untimed map/unmap warm-up the buffer is mapped again and numIter
+// passes over bufSize_ bytes are timed: memset into the mapping, memcpy out
+// of it (gpuSrc) or memcpy into it. The bandwidth in GB/s is stored in
+// _perfInfo and a description of the sub-test in testDescString.
+void OCLPerfCPUMemSpeed::run(void) {
+  CPerfCounter timer;
+
+  void *mem;
+  // Warm up
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+  // Validate the mapping before the timed loops touch `mem`; a failed map
+  // must not reach memcpy/memset.
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+
+  char *cpumem = new char[bufSize_];
+
+  timer.Reset();
+  timer.Start();
+  if (testMemset) {
+    for (unsigned int i = 0; i < numIter; i++) {
+      memset(mem, 0, bufSize_);
+    }
+  } else {
+    if (gpuSrc) {
+      for (unsigned int i = 0; i < numIter; i++) {
+        memcpy((void *)cpumem, mem, bufSize_);
+      }
+    } else {
+      for (unsigned int i = 0; i < numIter; i++) {
+        memcpy(mem, (void *)cpumem, bufSize_);
+      }
+    }
+  }
+
+  timer.Stop();
+
+  delete[] cpumem;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  double sec = timer.GetElapsedTime();
+
+  // Host access bandwidth over the mapped buffer, in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+
+  // The values printed below are unsigned, hence %u rather than %d.
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  const char *str2 = NULL;
+  if (testMemset)
+    str2 = "memset to dev";
+  else {
+    if (gpuSrc)
+      str2 = "memcpy from dev";
+    else
+      str2 = "memcpy to dev";
+  }
+
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %15s i: %4u %29s ", bufSize_, str2,
+           numIter, str);
+  testDescString = buf;
+}
+
+// Releases what open() created, in the order buffer, command queue, context,
+// then frees the USE_HOST_PTR backing allocation. A failed release is
+// reported through CHECK_RESULT_NO_RETURN. Returns _crcword.
+unsigned int OCLPerfCPUMemSpeed::close(void) {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // free() accepts a null pointer, so no guard is needed.
+  free(hostMem);
+
+  return _crcword;
+}
@@ -0,0 +1,59 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_CPUMemSpeed_H_
+#define _OCL_CPUMemSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfCPUMemSpeed : public OCLTestImp {  // times host memcpy/memset through a mapped OpenCL buffer
+ public:
+  OCLPerfCPUMemSpeed();
+  virtual ~OCLPerfCPUMemSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // decodes the sub-test index and creates context, queue and buffer
+  virtual void run(void);  // warm-up map/unmap, then the timed copy or fill loop
+  virtual unsigned int close(void);  // releases the OpenCL objects and hostMem; returns _crcword
+
+  static const unsigned int NUM_ITER = 100;  // larger of the two iteration counts; open() caps device-to-host runs at 10
+
+  cl_context context_;  // created in open(), released in close()
+  cl_command_queue cmd_queue_;  // created in open(), released in close()
+  cl_mem outBuffer_;  // buffer that run() maps; released in close()
+  cl_int error_;  // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // buffer size in bytes, taken from the Sizes table
+  bool persistent;  // adds CL_MEM_USE_PERSISTENT_MEM_AMD (only set on the AMD platform)
+  bool allocHostPtr;  // adds CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;  // adds CL_MEM_USE_HOST_PTR backed by hostMem
+  unsigned int numIter;  // passes of the timed memcpy/memset loop
+  bool testMemset;  // time memset instead of memcpy
+  char* hostMem;  // malloc'ed backing store for USE_HOST_PTR; freed in close()
+  char* alignedMem;  // hostMem rounded up to `alignment` plus `offset`; passed to clCreateBuffer as host_ptr
+  size_t alignment;  // set to 4096 in open()
+  unsigned int offset;  // byte offset applied after alignment in USE_HOST_PTR mode
+  bool isAMD;  // platform vendor string equals "Advanced Micro Devices, Inc."
+  bool gpuSrc;  // true: copy from the mapped buffer to host memory
+  cl_map_flags mapFlags;  // CL_MAP_READ when gpuSrc, otherwise CL_MAP_WRITE
+};
+
+#endif  // _OCL_CPUMemSpeed_H_
@@ -0,0 +1,146 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfCommandQueue.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+static const size_t BufSize = 0x1000;  // cl_int4 elements per buffer and in the host source array
+static const size_t Iterations = 0x100;  // iteration budget; run() divides it by the queue count and a power of two
+static const size_t TotalQueues = 4;  // run() selects the queue count with test_ % TotalQueues
+static const size_t TotalBufs = 4;  // open() selects the buffer count with test_ / TotalBufs
+
+// One sub-test per combination of queue count and buffer count. failed_
+// starts out false so that run() is not skipped.
+OCLPerfCommandQueue::OCLPerfCommandQueue() : failed_(false) {
+  _numSubTests = TotalBufs * TotalQueues;
+}
+
+// Nothing to release here; close() forwards to the base class.
+OCLPerfCommandQueue::~OCLPerfCommandQueue() {}
+
+// Opens the base test while timing that call (the time is printed for
+// sub-test 0), verifies that the device is a GPU, and creates the write-only
+// buffers that run() writes to. On a non-GPU device failed_ is set and no
+// buffers are created.
+void OCLPerfCommandQueue::open(unsigned int test, char* units,
+                               double& conversion, unsigned int deviceId) {
+  _deviceId = deviceId;
+
+  CPerfCounter initTimer;
+  initTimer.Reset();
+  initTimer.Start();
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  initTimer.Stop();
+
+  if (0 == test) {
+    const float initMs = static_cast<float>(initTimer.GetElapsedTime() * 1000);
+    printf("Runtime load/init time: %0.2f ms\n", initMs);
+  }
+  test_ = test;
+
+  cl_device_type kind;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(kind), &kind, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  const bool isGpu = (kind & CL_DEVICE_TYPE_GPU) != 0;
+  if (!isGpu) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+
+  // Buffer count for this sub-test group.
+  static const size_t MemObjects[] = {1, 100, 1000, 5000};
+  const size_t wanted = MemObjects[test_ / TotalBufs];
+  const size_t bytes = BufSize * sizeof(cl_int4);
+  for (size_t made = 0; made != wanted; ++made) {
+    cl_mem mem = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bytes,
+                                          NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(mem);
+  }
+}
+
+// Empty callback with the OpenCL context-notification signature. It ignores
+// every argument, so none of them are named.
+static void CL_CALLBACK notify_callback(const char* /*errinfo*/,
+                                        const void* /*private_info*/,
+                                        size_t /*cb*/, void* /*user_data*/) {
+}
+
+// Measures the average time to create and release a command queue.
+//
+// Each iteration creates numQueues queues (timed), performs a small blocking
+// write to every buffer on every queue (not timed), and releases the queues
+// (timed). The per-queue average in milliseconds is stored in _perfInfo.
+// Does nothing when open() flagged the device as unsuitable.
+void OCLPerfCommandQueue::run(void) {
+  if (failed_) {
+    return;
+  }
+  // Zero-filled host source for the writes. A vector value-initialises its
+  // elements and releases the storage on every exit path, including the
+  // CHECK_RESULT exits below.
+  std::vector<cl_int4> values(BufSize);
+  CPerfCounter timer;
+  static const size_t Queues[] = {1, 2, 4, 8};
+  size_t numQueues = Queues[test_ % TotalQueues];
+
+  // Fewer iterations for the sub-tests with more queues and more buffers.
+  size_t iter =
+      Iterations / (numQueues * ((size_t)1 << (test_ / TotalBufs + 1)));
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+
+  timer.Reset();
+  timer.Start();
+
+  for (size_t i = 0; i < iter; ++i) {
+    for (size_t q = 0; q < numQueues; ++q) {
+      cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+          context_, devices_[_deviceId], 0, &error_);
+      if (error_ != CL_SUCCESS) {
+        // Release the queues already created in this iteration so the exit
+        // below does not leak them.
+        for (size_t r = 0; r < q; ++r) {
+          _wrapper->clReleaseCommandQueue(cmdQueues[r]);
+        }
+      }
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+      cmdQueues[q] = cmdQueue;
+    }
+    // The writes only give the queues some work; keep them out of the timing.
+    timer.Stop();
+    // Remember a write failure separately: error_ is overwritten by the
+    // release calls before the write status is checked.
+    cl_int writeError = CL_SUCCESS;
+    for (size_t q = 0; q < numQueues; ++q) {
+      for (size_t b = 0; b < buffers_.size(); ++b) {
+        error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[b],
+                                                CL_TRUE, 0, sizeof(cl_int4),
+                                                &values[0], 0, NULL, NULL);
+        if (error_ != CL_SUCCESS) {
+          writeError = error_;
+        }
+      }
+    }
+    timer.Start();
+    for (size_t q = 0; q < numQueues; ++q) {
+      error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+      CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                             "clReleaseCommandQueue() failed");
+    }
+    CHECK_RESULT((writeError != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+  }
+
+  timer.Stop();
+
+  std::stringstream stream;
+
+  stream << "Create+destroy time for " << numQueues << " queues and "
+         << buffers_.size() << " buffers";
+  stream.precision(3);
+  stream.width(5);  // pads the "(ms)" suffix below to five characters
+  stream.setf(std::ios::fixed, std::ios::floatfield);
+  stream << "(ms)";
+  testDescString = stream.str();
+  _perfInfo =
+      static_cast<float>(timer.GetElapsedTime() * 1000 / (iter * numQueues));
+}
+
+// Forwards to the base-class close() and returns its result unchanged.
+unsigned int OCLPerfCommandQueue::close(void) {
+  return OCLTestImp::close();
+}
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_COMMAND_QUEUE_H_
+#define _OCL_PERF_COMMAND_QUEUE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfCommandQueue : public OCLTestImp {  // times command-queue creation and release
+ public:
+  OCLPerfCommandQueue();
+  virtual ~OCLPerfCommandQueue();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // opens the base test and creates the buffers for this sub-test
+  virtual void run(void);  // timed create/release loop; returns at once when failed_ is set
+  virtual unsigned int close(void);  // forwards to OCLTestImp::close()
+
+ private:
+  bool failed_;  // set by open() when the device is not a GPU
+  unsigned int test_;  // sub-test index saved by open()
+};
+
+#endif  // _OCL_PERF_COMMAND_QUEUE_H_
@@ -0,0 +1,563 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfConcurrency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+typedef struct {  // one view of the complex plane for the mandelbrot kernel
+  double x;  // centre of the view, horizontal axis
+  double y;  // centre of the view, vertical axis
+  double width;  // extent of the view; open() derives the per-pixel step from it
+} coordRec;
+
+static coordRec coords[] = {  // views to render; open() picks one with _openTest % numCoords
+    {0.0, 0.0, 0.00001},  // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);  // element count of coords[]
+
+static const char *float_mandel_vec =  // OpenCL C source of the "mandelbrot" kernel that open() builds
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % (width/4);\n"
+    "    int j = tid / (width/4);\n"
+    "    int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"  // each work-item handles four adjacent pixels
+    "    int4 vecj = (int4)(j, j, j, j);\n"
+    "    float4 x0;\n"
+    "    x0.s0 = (float)(xPos + xStep*veci.s0);\n"
+    "    x0.s1 = (float)(xPos + xStep*veci.s1);\n"
+    "    x0.s2 = (float)(xPos + xStep*veci.s2);\n"
+    "    x0.s3 = (float)(xPos + xStep*veci.s3);\n"
+    "    float4 y0;\n"
+    "    y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
+    "    y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
+    "    y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
+    "    y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
+    "\n"
+    "    float4 x = x0;\n"
+    "    float4 y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    float4 tmp;\n"
+    "    int4 stay;\n"
+    "    int4 ccount = 0;\n"
+    "    float4 savx = x;\n"
+    "    float4 savy = y;\n"
+    "    stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "    for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
+    "maxIter); iter+=16)\n"  // main loop: the body below is unrolled to 16 steps per pass
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "        savx = (stay ? x : savx);\n"
+    "        savy = (stay ? y : savy);\n"
+    "        ccount -= stay*16;\n"  // vector comparisons yield -1 per true lane, so this adds 16 for lanes still inside
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            // More efficient to use scalar ops here: Why?\n"
+    "            stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
+    "maxIter);\n"
+    "            stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
+    "maxIter);\n"
+    "            stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
+    "maxIter);\n"
+    "            stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
+    "maxIter);\n"
+    "            tmp = x;\n"
+    "            x = x*x + x0 - y*y;\n"
+    "            y = 2.0f*tmp*y + y0;\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
+    "            savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
+    "            savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
+    "            savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
+    "            savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
+    "            savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
+    "            savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
+    "            savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
+    "        } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
+    "    }\n"
+    "    __global uint4 *vecOut = (__global uint4 *)out;\n"
+    "    vecOut[tid] = convert_uint4(ccount);\n"  // one uint4 of iteration counts per work-item
+    "}\n";
+
+// Ten sub-tests are registered for every entry of coords[].
+OCLPerfConcurrency::OCLPerfConcurrency() {
+  _numSubTests = numCoords * 10;
+}
+
+// Empty: this destructor releases nothing itself.
+OCLPerfConcurrency::~OCLPerfConcurrency() {}
+
+// Fills the first width_ elements of `buffer` with `val` through a blocking
+// write mapping on queue 0, then unmaps and waits for the queue to drain.
+// If the mapping fails nothing is written and error_ keeps the status
+// returned by clEnqueueMapBuffer.
+void OCLPerfConcurrency::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_[0], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // Never write through a pointer from a failed mapping.
+  if ((error_ != CL_SUCCESS) || (data == NULL)) {
+    return;
+  }
+  for (unsigned int i = 0; i < width_; i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[0], buffer, data, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_[0]);
+}
+
+// Sums the first width_ elements of `buffer` into totalIters through a
+// blocking read mapping on queue 0, then unmaps and waits for the queue to
+// drain. If the mapping fails totalIters is left at 0 and error_ keeps the
+// status returned by clEnqueueMapBuffer.
+void OCLPerfConcurrency::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_[0], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  totalIters = 0;
+  // Never read through a pointer from a failed mapping.
+  if ((error_ != CL_SUCCESS) || (data == NULL)) {
+    return;
+  }
+  for (unsigned int i = 0; i < width_; i++) {
+    totalIters += data[i];
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[0], buffer, data, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_[0]);
+}
+
+// Callback passed to clCreateContext() in open(). It is deliberately empty,
+// so none of its arguments are named.
+static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
+                                        const void * /*private_info*/,
+                                        size_t /*cb*/, void * /*user_data*/) {
+}
+
+void OCLPerfConcurrency::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  unsigned int i;
+
+  if (type_ != CL_DEVICE_TYPE_GPU) {
+    char msg[256];
+    SNPRINTF(msg, sizeof(msg), "No GPU devices present. Exiting!\t");
+    testDescString = msg;
+    return;
+  }
+
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+
+  for (i = 0; i < MAX_ASYNC_QUEUES; i++) {
+    cmd_queue_[i] = 0;
+    program_[i] = 0;
+    kernel_[i] = 0;
+    outBuffer_[i] = 0;
+  }
+
+  // Maximum iteration count
+  // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible
+  // by 16 NOTE: Can increase to get better peak performance numbers, but be
+  // sure not to TDR slow ASICs! NOTE:. for warmup run we use maxIter = 256 and
+  // then for the actual run we use maxIter = 8388608 * (engine_clock / 1000).
+  maxIter = 256;
+
+  // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
+  // processes 4 pixels at once NOTE: Can increase to get better peak
+  // performance numbers, but be sure not to TDR slow ASICs!
+  width_ = 256;
+
+  // We compute a square domain
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+#if 0
+        // Get last for default
+        platform = platforms[numPlatforms-1];
+        for (i = 0; i < numPlatforms; ++i) {
+#endif
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose platform with GPU devices
+    // if (num_devices > 0)
+    //{
+    //    platform = platforms[_platformIndex];
+    //    break;
+    //}
+#if 0
+        }
+#endif
+    delete platforms;
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  device = devices[_deviceId];
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cl_uint numAsyncQueues;
+  error_ = _wrapper->clGetDeviceInfo(
+      device, CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD, sizeof(numAsyncQueues),
+      &numAsyncQueues, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  CHECK_RESULT(numAsyncQueues > MAX_ASYNC_QUEUES,
+               "numAsyncQueues is too large for this test");
+
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(size_t), &numCUs, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  switch (_openTest) {
+    case 0:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers = 1;
+      break;
+
+    case 1:
+      num_cmd_queues = 1;
+      num_programs = 1;
+      num_kernels = 1;
+      num_outbuffers = 2;
+      break;
+
+    case 2:
+      num_cmd_queues = 1;
+      num_programs = 2;
+      num_kernels = 2;
+      num_outbuffers = 2;
+      break;
+
+    case 3:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers = 2;
+      break;
+
+    case 4:
+    case 5:
+    case 6:
+    case 7:
+    case 8:
+    case 9:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers =
+          numAsyncQueues % 8;
+      break;
+
+    default:
+      break;
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    cmd_queue_[i] = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+    CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+  }
+
+  const char *tmp;
+  tmp = float_mandel_vec;
+
+  for (i = 0; i < num_programs; i++) {
+    program_[i] = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&tmp, NULL, &error_);
+    CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
+
+    error_ = _wrapper->clBuildProgram(program_[i], 1, &device, "", NULL, NULL);
+
+    if (error_ != CL_SUCCESS) {
+      cl_int intError;
+      char log[16384];
+      intError = _wrapper->clGetProgramBuildInfo(
+          program_[i], device, CL_PROGRAM_BUILD_LOG, 16384 * sizeof(char), log,
+          NULL);
+      printf("Build error -> %s\n", log);
+
+      CHECK_RESULT(0, "clBuildProgram failed");
+    }
+  }
+
+  for (i = 0; i < num_kernels; i++) {
+    kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
+    CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
+  }
+
+  coordIdx = _openTest % numCoords;
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[i]);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint),
+                                      (void *)&width_);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float),
+                                      (void *)&xPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float),
+                                      (void *)&yPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float),
+                                      (void *)&xStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float),
+                                      (void *)&yStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    setData(outBuffer_[i], 0xdeadbeef);
+  }
+
+  unsigned int clkFrequency = 0;
+  error_ = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                           sizeof(clkFrequency), &clkFrequency, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  assert(clkFrequency > 0);
+  maxIter =
+      (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
+  maxIter = (maxIter + 15) & ~15;
+}
+
+/**
+ * Times the kernels prepared by open() running across the configured queues.
+ *
+ * An untimed warm-up pass is enqueued first. The timed pass then enqueues
+ * every kernel (kernel i goes to queue i % num_cmd_queues), flushes all
+ * queues and waits for each of them to finish. The elapsed wall-clock time in
+ * seconds is stored in _perfInfo and a description of the sub-test in
+ * testDescString.
+ */
+void OCLPerfConcurrency::run(void) {
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return;
+
+  // We handle 4 pixels per thread
+  int global = width_ >> 2;
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+  unsigned int i;
+
+  // Warmup
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  // open() binds kernel argument 6 before it computes the final maxIter, so
+  // the argument is refreshed here ahead of the timed pass. Each result is
+  // checked; previously a failure was silently overwritten.
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+  }
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  // Sub-test 1 runs the same kernel a second time on the same queue, writing
+  // into the second output buffer.
+  if (_openTest == 1) {
+    error_ = _wrapper->clSetKernelArg(kernel_[0], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[1]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[0], kernel_[0], 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  unsigned long long expected =
+      (unsigned long long)width_ * (unsigned long long)maxIter;
+
+  // NOTE(review): checkData() presumably refreshes totalIters for the buffer
+  // it is given - confirm against its implementation.
+  for (i = 0; i < num_outbuffers; i++) {
+    checkData(outBuffer_[i]);
+    CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
+  }
+
+  _perfInfo = (float)sec;
+  if (_openTest == 0)
+    testDescString = "time for 1 kernel  (s)               ";
+  else if (_openTest == 1)
+    testDescString = "time for 2 kernels (s) (same kernel) ";
+  else if (_openTest == 2)
+    testDescString = "time for 2 kernels (s) (diff kernels)";
+  else {
+    char buf[128];
+    // Both counters are unsigned, so %u is the matching conversion.
+    SNPRINTF(buf, sizeof(buf), "time for %u kernels (s) (   %u queues) ",
+             num_kernels, num_cmd_queues);
+    testDescString = buf;
+  }
+}
+
+/**
+ * Releases every OpenCL object created by open(): output buffers, kernels,
+ * programs, command queues and finally the context.
+ *
+ * @return _crcword, or 0 when the test is skipped on a non-GPU device.
+ */
+unsigned int OCLPerfConcurrency::close(void) {
+  unsigned int i;
+
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return 0;
+
+  // open() may configure zero queues (numAsyncQueues % 8 == 0); in that case
+  // cmd_queue_[0] was never created and must not be handed to clFinish.
+  if (num_cmd_queues > 0) {
+    _wrapper->clFinish(cmd_queue_[0]);
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clReleaseKernel(kernel_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseKernel(kernel_) failed");
+  }
+
+  for (i = 0; i < num_programs; i++) {
+    error_ = _wrapper->clReleaseProgram(program_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseProgram(program_) failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_Perf_Concurrency_H_
+#define _OCL_Perf_Concurrency_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times "mandelbrot" kernels issued across one or more
+// command queues. The number of queues, programs, kernels and output buffers
+// is chosen per sub-test in open().
+class OCLPerfConcurrency : public OCLTestImp {
+ public:
+  OCLPerfConcurrency();
+  virtual ~OCLPerfConcurrency();
+
+ public:
+  // Test life cycle: open() creates the OpenCL objects, run() performs the
+  // timed execution, close() releases the objects again.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // NOTE(review): not referenced by the parts of the implementation reviewed
+  // here - confirm whether it is still needed.
+  std::string shader_;
+  // Called by open() with 0xdeadbeef for every output buffer.
+  void setData(cl_mem buffer, unsigned int data);
+  // Called by run() for every output buffer right before totalIters is
+  // compared with the expected iteration count.
+  void checkData(cl_mem buffer);
+
+// Capacity of the per-queue arrays below; open() rejects devices that report
+// more asynchronous queues than this.
+#define MAX_ASYNC_QUEUES 8
+
+  cl_context context_;
+  cl_command_queue cmd_queue_[MAX_ASYNC_QUEUES];
+  cl_program program_[MAX_ASYNC_QUEUES];
+  cl_kernel kernel_[MAX_ASYNC_QUEUES];
+  cl_mem outBuffer_[MAX_ASYNC_QUEUES];
+  cl_int error_;  // result of the most recent OpenCL call
+
+  // Number of valid entries in the corresponding arrays above.
+  unsigned int num_cmd_queues;
+  unsigned int num_programs;
+  unsigned int num_kernels;
+  unsigned int num_outbuffers;
+
+  unsigned int width_;    // kernel argument 1; run() launches width_ / 4 items
+  unsigned int bufSize_;  // size passed to clCreateBuffer for each out buffer
+  unsigned int maxIter;   // kernel argument 6 (iteration limit)
+  unsigned int coordIdx;  // index into the coords table, set in open()
+  unsigned long long totalIters;  // compared with width_ * maxIter in run()
+  size_t numCUs;  // receives CL_DEVICE_MAX_COMPUTE_UNITS in open()
+};
+
+#endif  // _OCL_Perf_Concurrency_H_
@@ -0,0 +1,243 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDevMemReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Buffer sizes (in bytes) available to the test; open() uses Sizes[0].
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024};
+
+// OpenCL C source of the read benchmark kernel.
+//
+// Each work-item starts at src[get_global_id(0)] and advances by `threads`
+// uint16 elements until it reaches src + size1, adding all sixteen lanes of
+// every element it loads to a private sum. The private sum is then added to
+// *dst with atomic_add, so *dst accumulates the sum of every uint read.
+const static char *strKernel =
+    "__kernel void read_kernel(__global uint16 *src, ulong size1, uint "
+    "threads, __global uint* dst\n"
+    "                          )\n"
+    "{\n"
+    "    uint16 pval;\n"
+    "    int idx = get_global_id(0);\n"
+    "    __global uint16 *srcEnd = src + size1;\n"
+    "     uint tmp = 0;\n"
+    "    src = &src[idx];"
+    "    while (src < srcEnd) \n"
+    "        {\n"
+    "            pval = *src;\n"
+    "            src += threads;\n"
+    "            tmp += pval.s0 + pval.s1 + pval.s2 + pval.s3 + pval.s4 + pval.s5 + pval.s6 + \
+  pval.s7 + pval.s8 + pval.s9 + pval.sa + pval.sb + pval.sc + pval.sd + pval.se + pval.sf;\n"
+    "        }\n"
+    "    atomic_add(dst, tmp);\n"
+    "}\n";
+
+// Registers a single sub-test. Every member gets a defined value so that
+// close() never reads an indeterminate skip_, srcBuffer_ or dstBuffer_ when
+// open() returns before assigning them.
+OCLPerfDevMemReadSpeed::OCLPerfDevMemReadSpeed()
+    : srcBuffer_(0),
+      dstBuffer_(0),
+      nWorkItems(0),
+      wgs(0),
+      nBytes(0),
+      nIter(0),
+      inputData(0),
+      skip_(false) {
+  _numSubTests = 1;
+}
+
+// Nothing to release here; OpenCL objects are released in close().
+OCLPerfDevMemReadSpeed::~OCLPerfDevMemReadSpeed() {}
+
+// No-op notification callback; all arguments are ignored.
+// NOTE(review): not referenced in the reviewed part of this file.
+static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
+                                        const void * /*private_info*/,
+                                        size_t /*cb*/, void * /*user_data*/) {
+}
+
+/**
+ * Sets up the device-memory read benchmark.
+ *
+ * Builds read_kernel, creates a source buffer of Sizes[0] bytes filled with
+ * the cl_uint value inputData, creates a one-element destination buffer
+ * cleared to zero, and binds all four kernel arguments.
+ *
+ * @param test        forwarded to OCLTestImp::open()
+ * @param units       forwarded to OCLTestImp::open()
+ * @param conversion  forwarded to OCLTestImp::open()
+ * @param deviceId    forwarded to OCLTestImp::open(); also selects the
+ *                    device the program is built for
+ */
+void OCLPerfDevMemReadSpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  // Reset our own state first so close() sees null handles on every
+  // early-return path below (srcBuffer_ used to be left untouched).
+  skip_ = false;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  nBytes = Sizes[0];
+  // The kernel walks the buffer in uint16 (16 x cl_uint) elements.
+  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));
+  cl_uint maxCUs;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(cl_uint), &maxCUs, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  wgs = 64;
+  const static cl_uint wavesPerCU = 8;
+  nWorkItems = maxCUs * wavesPerCU * wgs;
+
+  inputData = 0x1;
+  nIter = 1000;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Ask for the log size first: a fixed-size buffer that is too small makes
+    // the query fail and leaves the buffer unwritten.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char *programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
+                                    NULL);
+    programLog[logSize] = '\0';
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "read_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // Source buffer: filled with inputData through a blocking map.
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, nBytes,
+                                        NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(srcBuffer) failed");
+  void *mem;
+  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], srcBuffer_, CL_TRUE,
+                                     CL_MAP_READ | CL_MAP_WRITE, 0, nBytes, 0,
+                                     NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  cl_uint *words = reinterpret_cast<cl_uint *>(mem);
+  for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); ++i) {
+    words[i] = inputData;
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], srcBuffer_,
+                                             mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject failed");
+
+  // Destination buffer: the kernel updates it with atomic_add, which reads
+  // as well as writes, so it must not be created write-only.
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                        sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
+  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_TRUE,
+                                     CL_MAP_READ | CL_MAP_WRITE, 0,
+                                     sizeof(cl_uint), 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  memset(mem, 0, sizeof(cl_uint));
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], dstBuffer_,
+                                             mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&nWorkItems);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&dstBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+/**
+ * Runs the read benchmark.
+ *
+ * A warm-up launch is validated by reading back the single cl_uint in
+ * dstBuffer_ and comparing it with nBytes / sizeof(cl_uint). The kernel is
+ * then launched nIter times; _perfInfo receives bytes per unit of summed
+ * event-profiling time, and testDescString the wall-clock throughput in
+ * GB/s.
+ */
+void OCLPerfDevMemReadSpeed::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+
+  size_t gws[1] = {nWorkItems};
+  size_t lws[1] = {wgs};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Only one cl_uint is read back, so a local variable is enough and nothing
+  // can leak on the early returns below.
+  cl_uint memResult = 0;
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_,
+                                         CL_FALSE, 0, sizeof(cl_uint),
+                                         &memResult, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  if (memResult != (nBytes / sizeof(cl_uint))) {
+    // The first macro argument is the failure condition; passing 0 here
+    // meant the validation failure was never reported.
+    CHECK_RESULT_NO_RETURN(true, "Data validation failed for warm up run!\n");
+    return;
+  }
+
+  timer.Reset();
+  timer.Start();
+  double sec2 = 0;
+  cl_event *events = new cl_event[nIter];
+  unsigned int numEnqueued = 0;
+  for (; numEnqueued < nIter; numEnqueued++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL,
+                                              &events[numEnqueued]);
+    // Leave the loop instead of returning so the events created so far can
+    // be released below.
+    if (error_ != CL_SUCCESS) break;
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  const bool enqueueFailed = (numEnqueued != nIter);
+  bool profilingFailed = false;
+  for (unsigned int i = 0; i < numEnqueued; i++) {
+    if (!enqueueFailed && !profilingFailed) {
+      cl_ulong startTime = 0, endTime = 0;
+      error_ = _wrapper->clGetEventProfilingInfo(
+          events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
+          0);
+      if (error_ == CL_SUCCESS) {
+        error_ = _wrapper->clGetEventProfilingInfo(
+            events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+      }
+      if (error_ == CL_SUCCESS) {
+        sec2 += endTime - startTime;
+      } else {
+        profilingFailed = true;
+      }
+    }
+    // Every event is released, including on the failure paths.
+    _wrapper->clReleaseEvent(events[i]);
+  }
+  delete[] events;
+  CHECK_RESULT(enqueueFailed, "clEnqueueNDRangeKernel() failed");
+  CHECK_RESULT(profilingFailed, "clGetEventProfilingInfo failed");
+  double sec = timer.GetElapsedTime();
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
+  double perf2 = ((double)nBytes * nIter) / sec2;
+  _perfInfo = (float)perf2;
+  float perfInfo = (float)perf;
+  char buf[256];
+  // nBytes and nIter are unsigned, so %u is the matching conversion.
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i:%4u Wall time Perf: %.2f (GB/s)",
+           nBytes, nIter, perfInfo);
+  testDescString = buf;
+}
+
+/**
+ * Releases the source and destination buffers (unless the test was skipped)
+ * and then defers to OCLTestImp::close().
+ *
+ * @return the value returned by OCLTestImp::close().
+ */
+unsigned int OCLPerfDevMemReadSpeed::close(void) {
+  if (!skip_) {
+    if (srcBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(srcBuffer_) failed");
+    }
+
+    if (dstBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+      // The message now names the buffer that is actually being released.
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(dstBuffer_) failed");
+    }
+  }
+
+  return OCLTestImp::close();
+}
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DevMemReadSpeed_H_
+#define _OCL_DevMemReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures how fast a kernel can read a large device
+// buffer; see read_kernel in the implementation file.
+class OCLPerfDevMemReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfDevMemReadSpeed();
+  virtual ~OCLPerfDevMemReadSpeed();
+
+ public:
+  // Test life cycle: open() builds the kernel and buffers, run() validates a
+  // warm-up launch and times nIter launches, close() releases the buffers.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_mem srcBuffer_;        // nBytes-sized buffer the kernel reads from
+  cl_mem dstBuffer_;        // single cl_uint the kernel accumulates into
+  unsigned int nWorkItems;  // number of GPU work items
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // size in bytes of the source buffer
+  unsigned int nIter;       // number of timed kernel launches
+  cl_uint inputData;        // input data to fill the input buffer
+  bool skip_;               // when true, run() and the buffer release are skipped
+};
+
+#endif  // _OCL_DevMemReadSpeed_H_
@@ -0,0 +1,212 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDevMemWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Buffer sizes (in bytes) available to the test; open() uses Sizes[0].
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024};
+
+// OpenCL C source of the write benchmark kernel.
+//
+// Each work-item starts at dst[get_global_id(0)] and stores a uint16 whose
+// lanes are all 0xabababab, advancing by `threads` elements until it reaches
+// dst + size1. The loop is a do/while, so every work-item stores at least
+// one element.
+const static char *strKernel =
+
+    "__kernel void write_kernel(__global uint16 *dst, ulong size1, uint "
+    "threads\n"
+    "                          )\n"
+    "{\n"
+    "    uint16 pval = (uint16)(0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab,\
+ 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab);\n"
+    "    int idx = get_global_id(0);\n"
+    "    __global uint16 *dstEnd = dst + size1;\n"
+    "    dst = &dst[idx];"
+    "    do\n"
+    "        {\n"
+    "            *dst = pval;\n"
+    "            dst += threads;\n"
+    "        }\n"
+    "    while (dst < dstEnd);\n"
+    "}\n";
+
+// Registers a single sub-test. Every member gets a defined value so that
+// close() never reads an indeterminate skip_ or dstBuffer_ when open()
+// returns before assigning them.
+OCLPerfDevMemWriteSpeed::OCLPerfDevMemWriteSpeed()
+    : dstBuffer_(0),
+      nWorkItems(0),
+      wgs(0),
+      nBytes(0),
+      nIter(0),
+      inputData(0),
+      skip_(false) {
+  _numSubTests = 1;
+}
+
+// Nothing to release here; OpenCL objects are released in close().
+OCLPerfDevMemWriteSpeed::~OCLPerfDevMemWriteSpeed() {}
+
+// No-op notification callback; all arguments are ignored.
+// NOTE(review): not referenced in the reviewed part of this file.
+static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
+                                        const void * /*private_info*/,
+                                        size_t /*cb*/, void * /*user_data*/) {
+}
+
+/**
+ * Sets up the device-memory write benchmark.
+ *
+ * Builds write_kernel, creates a destination buffer of Sizes[0] bytes and
+ * binds the three kernel arguments.
+ *
+ * @param test        forwarded to OCLTestImp::open()
+ * @param units       forwarded to OCLTestImp::open()
+ * @param conversion  forwarded to OCLTestImp::open()
+ * @param deviceId    forwarded to OCLTestImp::open(); also selects the
+ *                    device the program is built for
+ */
+void OCLPerfDevMemWriteSpeed::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  skip_ = false;
+  dstBuffer_ = 0;
+  nBytes = Sizes[0];
+  // The kernel walks the buffer in uint16 (16 x cl_uint) elements.
+  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));
+  cl_uint maxCUs;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(cl_uint), &maxCUs, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  wgs = 64;
+  const static cl_uint wavesPerCU = 8;
+  nWorkItems = maxCUs * wavesPerCU * wgs;
+  // Must match the lane value the kernel stores; run() validates against it.
+  inputData = 0xabababab;
+  nIter = 1000;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Ask for the log size first: a fixed-size buffer that is too small makes
+    // the query fail and leaves the buffer unwritten.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char *programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
+                                    NULL);
+    programLog[logSize] = '\0';
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "write_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, nBytes,
+                                        NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &dstBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&nWorkItems);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+/**
+ * Runs the write benchmark.
+ *
+ * A warm-up launch is validated by reading dstBuffer_ back and checking that
+ * every cl_uint equals inputData. The kernel is then launched nIter times;
+ * _perfInfo receives bytes per unit of summed event-profiling time, and
+ * testDescString the wall-clock throughput in GB/s.
+ */
+void OCLPerfDevMemWriteSpeed::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+
+  size_t gws[1] = {nWorkItems};
+  size_t lws[1] = {wgs};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  cl_uint *memResult;
+  memResult = (cl_uint *)malloc(nBytes);
+  if (0 == memResult) {
+    // The first macro argument is the failure condition; passing 0 here
+    // meant the failure was never reported.
+    CHECK_RESULT_NO_RETURN(true, "malloc failed!\n");
+    return;
+  }
+
+  memset(memResult, 0, nBytes);
+  error_ =
+      _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_FALSE,
+                                    0, nBytes, memResult, 0, NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Free the host copy before bailing out; it used to leak on this path.
+    free(memResult);
+    CHECK_RESULT_NO_RETURN(true, "clEnqueueReadBuffer dstBuffer_ failed!");
+    return;
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  bool warmupValid = true;
+  for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); i++) {
+    if (memResult[i] != inputData) {
+      warmupValid = false;
+      break;
+    }
+  }
+  free(memResult);
+  if (!warmupValid) {
+    CHECK_RESULT_NO_RETURN(true, "Data validation failed for warm up run!\n");
+    return;
+  }
+
+  timer.Reset();
+  timer.Start();
+  double sec2 = 0;
+  cl_event *events = new cl_event[nIter];
+  unsigned int numEnqueued = 0;
+  for (; numEnqueued < nIter; numEnqueued++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL,
+                                              &events[numEnqueued]);
+    // Leave the loop instead of returning so the events created so far can
+    // be released below.
+    if (error_ != CL_SUCCESS) break;
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  const bool enqueueFailed = (numEnqueued != nIter);
+  bool profilingFailed = false;
+  for (unsigned int i = 0; i < numEnqueued; i++) {
+    if (!enqueueFailed && !profilingFailed) {
+      cl_ulong startTime = 0, endTime = 0;
+      error_ = _wrapper->clGetEventProfilingInfo(
+          events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
+          0);
+      if (error_ == CL_SUCCESS) {
+        error_ = _wrapper->clGetEventProfilingInfo(
+            events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+      }
+      if (error_ == CL_SUCCESS) {
+        sec2 += endTime - startTime;
+      } else {
+        profilingFailed = true;
+      }
+    }
+    // Every event is released, including on the failure paths.
+    _wrapper->clReleaseEvent(events[i]);
+  }
+  delete[] events;
+  CHECK_RESULT(enqueueFailed, "clEnqueueNDRangeKernel() failed");
+  CHECK_RESULT(profilingFailed, "clGetEventProfilingInfo failed");
+  double sec = timer.GetElapsedTime();
+
+  // write speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
+  double perf2 = ((double)nBytes * nIter) / sec2;
+  _perfInfo = (float)perf2;
+  float perfInfo = (float)perf;
+  char buf[256];
+  // nBytes and nIter are unsigned, so %u is the matching conversion.
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i:%4u Wall time Perf: %.2f (GB/s)",
+           nBytes, nIter, perfInfo);
+  testDescString = buf;
+}
+
+/**
+ * Releases the destination buffer (unless the test was skipped) and then
+ * defers to OCLTestImp::close().
+ *
+ * @return the value returned by OCLTestImp::close().
+ */
+unsigned int OCLPerfDevMemWriteSpeed::close(void) {
+  if (!skip_) {
+    if (dstBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+      // The message now names the buffer that is actually being released.
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(dstBuffer_) failed");
+    }
+  }
+
+  return OCLTestImp::close();
+}
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DevMemWriteSpeed_H_
+#define _OCL_DevMemWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures how fast a kernel can fill a large device
+// buffer; see write_kernel in the implementation file.
+class OCLPerfDevMemWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfDevMemWriteSpeed();
+  virtual ~OCLPerfDevMemWriteSpeed();
+
+ public:
+  // Test life cycle: open() builds the kernel and buffer, run() validates a
+  // warm-up launch and times nIter launches, close() releases the buffer.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_mem dstBuffer_;        // nBytes-sized buffer the kernel writes to
+  unsigned int nWorkItems;  // number of GPU work items
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // output buffer size
+  unsigned int nIter;       // number of timed kernel launches
+  cl_uint inputData;        // value run() expects in every cl_uint of dstBuffer_
+  bool skip_;               // when true, run() and the buffer release are skipped
+};
+
+#endif  // _OCL_DevMemWriteSpeed_H_
@@ -0,0 +1,480 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceConcurrency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// One region of the complex plane: centre point (x, y) and the side length
+// `width` of the area rendered around it.
+struct coordRec {
+  double x;
+  double y;
+  double width;
+};
+
+// Regions to render; open() picks one with the subtest index modulo the
+// table size.
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},  // All black
+};
+
+// Number of entries in coords.
+static unsigned int numCoords = sizeof(coords) / sizeof(coords[0]);
+
+// OpenCL C source of the "mandelbrot" kernel.
+//
+// Each work-item handles four horizontally adjacent pixels using int4/float4
+// values. The main loop advances all four lanes in batches of 16 unrolled
+// iterations (eight "two iteration" groups) and only commits a batch for the
+// lanes that are still inside the radius-2 circle afterwards. The remainder
+// block then steps one iteration at a time, at most 16 times, for lanes that
+// escaped part-way through a batch. The four per-pixel iteration counts are
+// stored as one uint4 at index `tid` of `out`.
+static const char *float_mandel_vec =
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % (width/4);\n"
+    "    int j = tid / (width/4);\n"
+    "    int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
+    "    int4 vecj = (int4)(j, j, j, j);\n"
+    "    float4 x0;\n"
+    "    x0.s0 = (float)(xPos + xStep*veci.s0);\n"
+    "    x0.s1 = (float)(xPos + xStep*veci.s1);\n"
+    "    x0.s2 = (float)(xPos + xStep*veci.s2);\n"
+    "    x0.s3 = (float)(xPos + xStep*veci.s3);\n"
+    "    float4 y0;\n"
+    "    y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
+    "    y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
+    "    y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
+    "    y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
+    "\n"
+    "    float4 x = x0;\n"
+    "    float4 y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    float4 tmp;\n"
+    "    int4 stay;\n"
+    "    int4 ccount = 0;\n"
+    "    float4 savx = x;\n"
+    "    float4 savy = y;\n"
+    "    stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "    for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
+    "maxIter); iter+=16)\n"
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "        savx = (stay ? x : savx);\n"
+    "        savy = (stay ? y : savy);\n"
+    "        ccount -= stay*16;\n"
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            // More efficient to use scalar ops here: Why?\n"
+    "            stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
+    "maxIter);\n"
+    "            stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
+    "maxIter);\n"
+    "            stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
+    "maxIter);\n"
+    "            stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
+    "maxIter);\n"
+    "            tmp = x;\n"
+    "            x = x*x + x0 - y*y;\n"
+    "            y = 2.0f*tmp*y + y0;\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
+    "            savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
+    "            savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
+    "            savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
+    "            savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
+    "            savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
+    "            savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
+    "            savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
+    "        } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
+    "    }\n"
+    "    __global uint4 *vecOut = (__global uint4 *)out;\n"
+    "    vecOut[tid] = convert_uint4(ccount);\n"
+    "}\n";
+
+// Counts the devices of type type_ on the selected platform and exposes one
+// subtest per device, capped at MAX_DEVICES.
+OCLPerfDeviceConcurrency::OCLPerfDeviceConcurrency() {
+  cl_uint numPlatforms = 0;
+  // Start from zero so _numSubTests below is well defined even when no
+  // platform is reported.
+  num_devices = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    // Value-initialised so the selected entry is a null handle rather than
+    // garbage if the query below fails.
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms]();
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    cl_platform_id platform = platforms[_platformIndex];
+    // Array delete to match new[]. The list is released before the check so
+    // it cannot leak should CHECK_RESULT leave the constructor.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    if (num_devices > MAX_DEVICES) {
+      num_devices = MAX_DEVICES;
+    }
+  }
+  _numSubTests = num_devices;
+}
+
+// Nothing to do here; the OpenCL objects are released in close().
+OCLPerfDeviceConcurrency::~OCLPerfDeviceConcurrency() {
+}
+
+// Fills the first width_ unsigned ints of `buffer` with `val` through a
+// blocking write mapping on cmd_queue_[idx], then unmaps the buffer and
+// waits for the queue to finish.
+void OCLPerfDeviceConcurrency::setData(cl_mem buffer, unsigned int idx,
+                                       unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_[idx], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  if (data == NULL) {
+    // Mapping failed: report it and stop instead of writing through NULL.
+    CHECK_RESULT_NO_RETURN(true, "clEnqueueMapBuffer failed");
+    return;
+  }
+  for (unsigned int i = 0; i < width_; i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[idx], buffer, data, 0,
+                                             NULL, NULL);
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                         "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_[idx]);
+}
+
+// Sums the first width_ unsigned ints of `buffer` into totalIters through a
+// blocking read mapping on cmd_queue_[idx], then unmaps the buffer and waits
+// for the queue to finish. totalIters is left at 0 if the mapping fails.
+void OCLPerfDeviceConcurrency::checkData(cl_mem buffer, unsigned int idx) {
+  // Reset first so a failed mapping cannot leave a stale total behind.
+  totalIters = 0;
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_[idx], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  if (data == NULL) {
+    // Mapping failed: report it and stop instead of reading through NULL.
+    CHECK_RESULT_NO_RETURN(true, "clEnqueueMapBuffer failed");
+    return;
+  }
+  for (unsigned int i = 0; i < width_; i++) {
+    totalIters += data[i];
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[idx], buffer, data, 0,
+                                             NULL, NULL);
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                         "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_[idx]);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares subtest `test`.
+//
+// Creates one context over the devices of type type_ on the selected
+// platform, then a command queue, output buffer, program and "mandelbrot"
+// kernel for each of the first test + 1 devices. Kernel arguments are set
+// with the warm-up iteration count; the iteration count used for the timed
+// run is derived at the end from the first device's clock frequency.
+//
+// `units` is not used. `conversion` is set to 1.0.
+void OCLPerfDeviceConcurrency::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  // num_devices is clamped to MAX_DEVICES below, so a fixed array is large
+  // enough and nothing has to be freed on the early-exit paths.
+  cl_device_id devices[MAX_DEVICES] = {0};
+  unsigned int i;
+  num_devices = 0;
+  // Nothing has been created yet; keeps close() from walking the handle
+  // arrays with a stale count if open() stops early.
+  cur_devices = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+
+  for (i = 0; i < MAX_DEVICES; i++) {
+    cmd_queue_[i] = 0;
+    program_[i] = 0;
+    kernel_[i] = 0;
+    outBuffer_[i] = 0;
+  }
+
+  // Maximum iteration count
+  // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible
+  // by 16.
+  // NOTE: Can increase to get better peak performance numbers, but be sure not
+  // to TDR slow ASICs!
+  // NOTE: For the warmup run we use maxIter = 256 and then for the actual run
+  // we use maxIter = 8388608 * (engine_clock / 1000).
+  maxIter = 256;
+
+  // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
+  // processes 4 pixels at once.
+  // NOTE: Can increase to get better peak performance numbers, but be sure not
+  // to TDR slow ASICs!
+  width_ = 256;
+
+  // One row of width_ pixels, one cl_uint per pixel.
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    // Value-initialised so the selected entry is a null handle rather than
+    // garbage if the query below fails.
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms]();
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    platform = platforms[_platformIndex];
+    // Array delete to match new[]. The list is released before the check so
+    // it cannot leak should CHECK_RESULT leave the function.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    if (num_devices > MAX_DEVICES) {
+      num_devices = MAX_DEVICES;
+    }
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  CHECK_RESULT(num_devices == 0, "no devices");
+
+  /* Get the requested devices */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, num_devices, devices,
+                                       notify_callback, NULL, &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  // Subtest N drives the first N + 1 devices; refuse to index past the
+  // devices that were actually returned.
+  const cl_uint wanted = _openTest + 1;
+  CHECK_RESULT(wanted > num_devices,
+               "subtest needs more devices than are available");
+  cur_devices = wanted;
+
+  for (i = 0; i < cur_devices; i++) {
+    cmd_queue_[i] =
+        _wrapper->clCreateCommandQueue(context_, devices[i], 0, NULL);
+    CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+  }
+
+  const char *tmp;
+  tmp = float_mandel_vec;
+
+  for (i = 0; i < cur_devices; i++) {
+    program_[i] = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&tmp, NULL, &error_);
+    CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
+
+    error_ =
+        _wrapper->clBuildProgram(program_[i], 1, &devices[i], "", NULL, NULL);
+
+    if (error_ != CL_SUCCESS) {
+      char log[16384];
+      // Keep the buffer printable even if the log query itself fails.
+      log[0] = '\0';
+      _wrapper->clGetProgramBuildInfo(program_[i], devices[i],
+                                      CL_PROGRAM_BUILD_LOG,
+                                      16384 * sizeof(char), log, NULL);
+      printf("Build error on device %d -> %s\n", i, log);
+
+      CHECK_RESULT(0, "clBuildProgram failed");
+    }
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
+    CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
+  }
+
+  // Top-left corner and per-pixel step of the region selected for this
+  // subtest.
+  coordIdx = _openTest % numCoords;
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  for (i = 0; i < cur_devices; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[i]);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint),
+                                      (void *)&width_);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float),
+                                      (void *)&xPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float),
+                                      (void *)&yPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float),
+                                      (void *)&xStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float),
+                                      (void *)&yStep);
+    // Still the warm-up value here; run() re-sets this argument before the
+    // timed launch.
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    setData(outBuffer_[i], i, 0xdeadbeef);
+  }
+
+  // Scale the timed iteration count with the first device's clock and round
+  // it up to a multiple of 16.
+  cl_uint clkFrequency = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                     sizeof(clkFrequency), &clkFrequency, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  assert(clkFrequency > 0);
+  maxIter = (unsigned int)(8388608 * ((float)clkFrequency / 1000));
+  maxIter = (maxIter + 15) & ~15;
+}
+
+// Launches the kernel once on every active queue as a warm-up, switches the
+// iteration-count argument to maxIter, then times one launch on every active
+// queue (enqueue all, flush all, finish all). Afterwards each device's
+// summed iteration count is compared with width_ * maxIter, and the elapsed
+// seconds are stored in _perfInfo.
+void OCLPerfDeviceConcurrency::run(void) {
+  // Every work-item covers 4 pixels.
+  const int globalItems = width_ >> 2;
+  const int localItems = 64;
+
+  size_t global_work_size[1] = {(size_t)globalItems};
+  size_t local_work_size[1] = {(size_t)localItems};
+  unsigned int dev;
+
+  // Warm-up pass.
+  for (dev = 0; dev < cur_devices; dev++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[dev], kernel_[dev], 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  for (dev = 0; dev < cur_devices; dev++) {
+    _wrapper->clFlush(cmd_queue_[dev]);
+  }
+  for (dev = 0; dev < cur_devices; dev++) {
+    _wrapper->clFinish(cmd_queue_[dev]);
+  }
+
+  // Use the full iteration count for the measured pass.
+  for (dev = 0; dev < cur_devices; dev++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[dev], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  CPerfCounter timer;
+  timer.Reset();
+  timer.Start();
+
+  // Measured pass.
+  for (dev = 0; dev < cur_devices; dev++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[dev], kernel_[dev], 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  for (dev = 0; dev < cur_devices; dev++) {
+    _wrapper->clFlush(cmd_queue_[dev]);
+  }
+  for (dev = 0; dev < cur_devices; dev++) {
+    _wrapper->clFinish(cmd_queue_[dev]);
+  }
+
+  timer.Stop();
+  const double elapsed = timer.GetElapsedTime();
+
+  // Every pixel is expected to run the full iteration count.
+  const unsigned long long expected =
+      (unsigned long long)width_ * (unsigned long long)maxIter;
+
+  for (dev = 0; dev < cur_devices; dev++) {
+    checkData(outBuffer_[dev], dev);
+    CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
+  }
+
+  _perfInfo = (float)elapsed;
+  char description[128];
+  SNPRINTF(description, sizeof(description),
+           "time for %2d devices (s) (%2d queues) ", cur_devices, cur_devices);
+  testDescString = description;
+}
+
+// Releases the buffers, kernels, programs and queues of the active devices,
+// then the context, and returns _crcword.
+//
+// Handles that are still 0 (open() zeroes them and may stop before creating
+// all of them) are skipped, matching the existing guard on context_. Each
+// handle is cleared after release so a repeated close() cannot release it
+// twice.
+unsigned int OCLPerfDeviceConcurrency::close(void) {
+  unsigned int i;
+
+  for (i = 0; i < cur_devices; i++) {
+    if (outBuffer_[i]) {
+      error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(outBuffer_) failed");
+      outBuffer_[i] = 0;
+    }
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    if (kernel_[i]) {
+      error_ = _wrapper->clReleaseKernel(kernel_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseKernel(kernel_) failed");
+      kernel_[i] = 0;
+    }
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    if (program_[i]) {
+      error_ = _wrapper->clReleaseProgram(program_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseProgram(program_) failed");
+      program_[i] = 0;
+    }
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    if (cmd_queue_[i]) {
+      error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseCommandQueue failed");
+      cmd_queue_[i] = 0;
+    }
+  }
+
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    context_ = 0;
+  }
+
+  return _crcword;
+}
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_Perf_DeviceConcurrency_H_
+#define _OCL_Perf_DeviceConcurrency_H_
+
+#include "OCLTestImp.h"
+
+// Test class derived from OCLTestImp that keeps one command queue, program,
+// kernel and output buffer per device (up to MAX_DEVICES) inside a single
+// context.
+class OCLPerfDeviceConcurrency : public OCLTestImp {
+ public:
+  OCLPerfDeviceConcurrency();
+  virtual ~OCLPerfDeviceConcurrency();
+
+  // Test entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+  // Buffer helpers; idx selects the command queue to use.
+  void setData(cl_mem buffer, unsigned int idx, unsigned int data);
+  void checkData(cl_mem buffer, unsigned int idx);
+
+  std::string shader_;
+
+// Capacity of the per-device arrays below.
+#define MAX_DEVICES 16
+
+  cl_context context_;
+  cl_command_queue cmd_queue_[MAX_DEVICES];
+  cl_program program_[MAX_DEVICES];
+  cl_kernel kernel_[MAX_DEVICES];
+  cl_mem outBuffer_[MAX_DEVICES];
+  cl_int error_;
+
+  cl_uint num_devices;  // devices found, clamped to MAX_DEVICES
+  cl_uint cur_devices;  // devices used by the current subtest
+
+  unsigned int width_;    // pixels per output buffer
+  unsigned int bufSize_;  // output buffer size in bytes
+  unsigned int maxIter;   // iteration count passed to the kernel
+  unsigned int coordIdx;  // index into the coordinate table
+  unsigned long long totalIters;  // sum computed by checkData
+};
+
+#endif  // _OCL_Perf_DeviceConcurrency_H_
@@ -0,0 +1,227 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueue.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+// One table entry: the thread count used as the base for a subtest.
+struct testStruct {
+  unsigned int threads;
+};
+
+// Thread counts, doubling from 64 up to 4096.
+static testStruct testList[] = {
+    {64},
+    {128},
+    {256},
+    {512},
+    {1024},
+    {2048},
+    {4096},
+};
+
+// OpenCL C program text, turned into a string literal by KERNEL_CODE.
+//
+// parentKernel: every work-item enqueues a 64-item childKernel launch on the
+// default device queue with CLK_ENQUEUE_FLAGS_WAIT_KERNEL. The enqueue result
+// is stored in enq_res and not examined.
+// childKernel: writes buf[idx] only when its global id, read as an int, is
+// negative.
+const static char* strKernel = {KERNEL_CODE(
+    \n __kernel void childKernel(__global uint* buf) {
+  int idx = get_global_id(0);
+  if (idx < 0) {
+    buf[idx] = 0;
+  }
+}
+    \n __kernel void parentKernel(__global uint* buf) {
+  queue_t def_q = get_default_queue();
+  ndrange_t ndrange = ndrange_1D(64, 64);
+  int gid = get_global_id(0);
+
+  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+    childKernel(buf);
+  });
+}
+    \n)};
+
+// Clears the handles released in close() and registers seven passes over the
+// thread-count table as subtests.
+OCLPerfDeviceEnqueue::OCLPerfDeviceEnqueue() {
+  deviceQueue_ = NULL;
+  kernel2_ = NULL;
+  failed_ = false;
+
+  testListSize = sizeof(testList) / sizeof(testList[0]);
+  _numSubTests = 7 * testListSize;
+}
+
+// Nothing to do here; the device queue and second kernel are released in
+// close().
+OCLPerfDeviceEnqueue::~OCLPerfDeviceEnqueue() {
+}
+
+// Prepares subtest `test`.
+//
+// Does nothing for CPU devices. Otherwise opens the base test, sets failed_
+// and stops if the device version string reports a major version below 2,
+// builds the parent/child kernels with -cl-std=CL2.0, creates a 2048-byte
+// buffer, derives the thread count and device-queue size from the subtest
+// index, and (when CL_VERSION_2_0 is defined) creates the default on-device
+// queue.
+void OCLPerfDeviceEnqueue::open(unsigned int test, char* units,
+                                double& conversion, unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  threads = testList[testID_ % testListSize].threads;
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  // Value-initialised so the buffer holds zeros if the query below fails.
+  char* strVersion = new char[param_size]();
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // The version string reads "OpenCL <major>.<minor> ...", which puts the
+  // major digit at index 7. Copy it out, guarding against a shorter string,
+  // and free the buffer right away so no exit path below can leak it.
+  const char majorVersion = (param_size > 7) ? strVersion[7] : '\0';
+  delete[] strVersion;  // array delete to match new[]
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (majorVersion < '2') {
+    failed_ = true;
+    return;
+  }
+  cl_uint maxDevQSize = 0;
+#if defined(CL_VERSION_2_0)
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE,
+                                     sizeof(cl_uint), &maxDevQSize, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+#endif
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  // Hardcoded for us
+  // The first pass over testList sizes the queue from the thread count; later
+  // passes double the queue size per pass (capped at the device maximum) and
+  // scale the thread count to match.
+  if (testID_ >= testListSize) {
+    queueSize = (1 << (testID_ / testListSize)) * 256 * 1024;
+    queueSize = std::min(queueSize, maxDevQSize);
+    threads *= (1 << (testID_ / testListSize - 1));
+    threads = std::min(threads, queueSize / 128);
+  } else {
+    queueSize = std::max((cl_uint)threads * 128, (cl_uint)16384);
+  }
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Launches parentKernel once untimed, then times `repeats` launches, each
+// followed by clFinish, and reports (threads * repeats) per microsecond in
+// _perfInfo. Returns immediately for CPU devices or when open() set failed_.
+void OCLPerfDeviceEnqueue::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU || failed_) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  // 64-wide work-groups, or 256-wide once there are at least 256 threads.
+  size_t gws[1] = {threads};
+  size_t lws[1] = {64};
+  if (gws[0] >= 256) {
+    lws[0] = 256;
+  }
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Untimed first launch.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Try to normalize the amount of work per test
+  const unsigned int scaled = (64 / threads) * 50;
+  const unsigned int repeats = (scaled == 0) ? 1 : scaled;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass < repeats; pass++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  const double elapsed = timer.GetElapsedTime();
+
+  _perfInfo = (float)(threads * repeats) / (float)(elapsed * 1000000.);
+  char description[256];
+  SNPRINTF(description, sizeof(description),
+           "%7d threads spawning 64 threads, queue size %5dKB (Mdisp/s)",
+           threads, queueSize / 1024);
+  testDescString = description;
+}
+
+// Releases the on-device queue and the second kernel if they exist, then
+// defers to OCLTestImp::close(). Returns 0 without doing anything for CPU
+// devices. Both handles are cleared after release so a later close() that
+// follows an open() which stopped early cannot release them a second time.
+unsigned int OCLPerfDeviceEnqueue::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (NULL != deviceQueue_) {
+    _wrapper->clReleaseCommandQueue(deviceQueue_);
+    deviceQueue_ = NULL;
+  }
+  if (NULL != kernel2_) {
+    _wrapper->clReleaseKernel(kernel2_);
+    kernel2_ = NULL;
+  }
+  return OCLTestImp::close();
+}
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_H_
+#define _OCLPERF_DEVICE_ENQUEUE_H_
+
+#include "OCLTestImp.h"
+
+// Test class derived from OCLTestImp that declares its own open/run/close
+// entry points and keeps an extra command queue and kernel handle.
+class OCLPerfDeviceEnqueue : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueue();
+  virtual ~OCLPerfDeviceEnqueue();
+
+  // Test entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  cl_command_queue deviceQueue_;  // extra queue handle, released in close()
+  bool failed_;                   // when set, run() returns immediately
+  unsigned int testID_;           // subtest index given to open()
+  cl_kernel kernel2_;             // second kernel handle, released in close()
+  unsigned int testListSize;      // number of entries in the test table
+  unsigned int threads;           // global work size used by run()
+  cl_uint queueSize;              // queue size; run() prints it / 1024 as KB
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_H_
@@ -0,0 +1,260 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueue2.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;  // copied to the threads member in open(); global work size in run()
+} testStruct;
+
+static testStruct testList[] = {  // indexed in open() by testID_ / (subTests_qsize * subTests_level)
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+
+static unsigned int qsizeList[] = {  // open() multiplies the chosen entry by 1024 for CL_QUEUE_SIZE
+    16, 32, 64, 128, 256, 512,
+};
+
+static unsigned int levelList[] = {  // candidate values for the kernels' level argument
+    1,
+    2,
+    4,
+    8,
+};
+
+const static char* strKernel = {KERNEL_CODE(  // OpenCL C source; open() builds it with -cl-std=CL2.0
+  \n __kernel void childKernel(__global uint* buf, uint level) {
+  if (level) {
+    queue_t def_q = get_default_queue();
+    ndrange_t ndrange = ndrange_1D(64, 64);
+    int gid = get_global_id(0);
+    int lid = get_local_id(0);
+    if (lid == 0) {  // only the first work-item of a group enqueues the next level
+      int enq_res =
+          enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+            childKernel(buf, level - 1);
+          });
+    }
+  } else {
+    int idx = get_global_id(0);
+    if (idx < 0) {  // NOTE(review): looks never true; presumably just keeps a store to buf in the kernel - confirm
+      buf[idx] = 0;
+    }
+  }
+}
+  \n __kernel void parentKernel(__global uint* buf, uint level) {
+  queue_t def_q = get_default_queue();
+  ndrange_t ndrange = ndrange_1D(64, 64);
+  int gid = get_global_id(0);
+
+  if (level) {  // every work-item of the parent enqueues one 64-wide child launch
+    int enq_res =
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+          childKernel(buf, level - 1);
+        });
+  }
+}
+  \n)};
+
+// Sizes the sub-test space as (levels x queue sizes x thread counts) and puts
+// every data member into a defined state. The per-sub-test values are
+// overwritten by open(); the defaults only guarantee that nothing is read
+// while indeterminate if open() returns early.
+OCLPerfDeviceEnqueue2::OCLPerfDeviceEnqueue2() {
+  subTests_level = sizeof(levelList) / sizeof(levelList[0]);
+  subTests_qsize = sizeof(qsizeList) / sizeof(qsizeList[0]);
+  subTests_thread = sizeof(testList) / sizeof(testList[0]);
+  testListSize = subTests_thread;
+  _numSubTests = subTests_level * subTests_qsize * subTests_thread;
+
+  deviceQueue_ = NULL;
+  kernel2_ = NULL;
+  failed_ = false;
+  skip_ = false;
+  level = 2;
+
+  // These were previously left uninitialised by the constructor.
+  testID_ = 0;
+  threads = testList[0].threads;  // non-zero: run() divides by threads
+  queueSize = qsizeList[0] * 1024;
+  lws_value = 64;  // same local size that run() hard-codes
+}
+
+OCLPerfDeviceEnqueue2::~OCLPerfDeviceEnqueue2() {}  // empty: CL handles are released in close()
+
+// Prepares sub-test `test` on device `deviceId`.
+//
+// Decodes `test` into a (threads, queue size, level) combination, checks that
+// the device reports OpenCL 2.0 or newer, builds parentKernel/childKernel,
+// allocates the scratch buffer and creates the default on-device queue.
+//
+// Returns immediately for CPU devices. Sets failed_ when the device version
+// is below 2.0 and skip_ when the host headers lack CL_VERSION_2_0; run()
+// returns early in both cases. `units` and `conversion` are only forwarded
+// to OCLTestImp::open().
+void OCLPerfDeviceEnqueue2::open(unsigned int test, char* units,
+                                 double& conversion, unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  // The sub-test index varies level fastest, then queue size, then threads.
+  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
+  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
+  level = levelList[testID_ % subTests_level];
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // The version string reads "OpenCL <major>.<minor> ...", so index 7 holds
+  // the major digit. Evaluate it and free the buffer before any return below,
+  // so that no exit path leaks it; a string too short to hold the digit is
+  // treated as unsupported instead of being read out of bounds.
+  const bool preCL20 = (error_ == CL_SUCCESS) &&
+                       (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Size the log buffer from the runtime: with a fixed-size buffer that is
+    // too small the query fails and the buffer would be printed unwritten.
+    // error_ is deliberately not touched so the build failure is reported.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char* programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    if (logSize > 0) {
+      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                      CL_PROGRAM_BUILD_LOG, logSize,
+                                      programLog, NULL);
+    }
+    programLog[logSize] = '\0';
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  // Default on-device queue that the kernels obtain via get_default_queue().
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced elsewhere in this file
+
+// Times repeated parentKernel launches on the host queue of the selected
+// device. The rate (threads * repeats * level, in millions per second of wall
+// time) goes to _perfInfo and a description of the sub-test to
+// testDescString. Does nothing for CPU devices or when open() set failed_ or
+// skip_.
+void OCLPerfDeviceEnqueue2::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU || failed_ || skip_) {
+    return;
+  }
+
+  cl_mem scratch = buffers()[0];
+  size_t globalSize[1] = {threads};
+  size_t localSize[1] = {64};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &scratch);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // One launch outside the timed region.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, globalSize, localSize, 0,
+                                            NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Larger launches get fewer repeats, to normalize the work per sub-test.
+  unsigned int repeats = (4096 / threads) * 10;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass < repeats; pass++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, globalSize, localSize, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double elapsedSec = timer.GetElapsedTime();
+  unsigned int dispatches = threads * repeats * level;
+  _perfInfo = (float)(dispatches) / (float)(elapsedSec * 1000000.);
+
+  char desc[256];
+  SNPRINTF(
+      desc, sizeof(desc),
+      "%5d threads spawning 64 threads, queue size %3dKB (Mdisp/s), level=%2d",
+      threads, queueSize / 1024, level);
+  testDescString = desc;
+}
+
+// Releases the device queue and the child kernel created by open(), then
+// defers to OCLTestImp::close() and returns its result. Returns 0 straight
+// away for CPU devices, for which open() creates nothing.
+unsigned int OCLPerfDeviceEnqueue2::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    // Forget the handle so a later close(), or an open() that returns before
+    // recreating the queue, cannot release it a second time.
+    deviceQueue_ = NULL;
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel2_ = NULL;  // same reasoning as for deviceQueue_
+  }
+  return OCLTestImp::close();
+}
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE2_H_
+#define _OCLPERF_DEVICE_ENQUEUE2_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDeviceEnqueue2 : public OCLTestImp {  // device-enqueue perf test over thread count x queue size x level
+ public:
+  OCLPerfDeviceEnqueue2();
+  virtual ~OCLPerfDeviceEnqueue2();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device default queue; created in open(), released in close()
+  unsigned int testID_;  // sub-test index handed to open()
+  cl_kernel kernel2_;  // "childKernel" handle; released in close()
+  unsigned int testListSize;  // same value as subTests_thread
+  unsigned int threads;  // global work size of each parentKernel launch
+  cl_uint queueSize;  // value passed as CL_QUEUE_SIZE
+  unsigned int subTests_level;  // number of entries in levelList
+  unsigned int subTests_qsize;  // number of entries in qsizeList
+  unsigned int subTests_thread;  // number of entries in testList
+  unsigned int level;  // value passed as the kernels' level argument
+  unsigned int lws_value;  // NOTE(review): never read by the OCLPerfDeviceEnqueue2 implementation
+
+  bool failed_;  // set by open() when the device version is below 2.0
+  bool skip_;  // set by open() when the host build lacks CL_VERSION_2_0
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE2_H_
@@ -0,0 +1,267 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueueEvent.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;  // copied to the threads member in open(); global work size in run()
+} testStruct;
+
+static testStruct testList[] = {  // indexed in open() by testID_ / (subTests_qsize * subTests_level)
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+
+static unsigned int qsizeList[] = {  // open() multiplies the chosen entry by 1024 for CL_QUEUE_SIZE
+    16, 32, 64, 128, 256, 512,
+};
+
+static unsigned int levelList[] = {  // candidate values for the kernels' level argument
+    1,
+    2,
+    4,
+    8,
+};
+
+const static char* strKernel = {KERNEL_CODE(  // OpenCL C source; open() builds it with -cl-std=CL2.0
+  \n __kernel void childKernel(__global uint* buf, uint level,
+                                clk_event_t wait_evt) {
+  int idx = get_global_id(0);
+  if (idx < 0) {  // NOTE(review): looks never true; presumably just keeps a store to buf in the kernel - confirm
+    buf[idx] = 0;
+  }
+}
+  \n __kernel void parentKernel(__global uint* buf, uint level) {
+  if (level) {
+    queue_t def_q = get_default_queue();
+    ndrange_t ndrange = ndrange_1D(64, 64);
+    clk_event_t user_evt = create_user_event();
+    clk_event_t block_evt, wait_evt;
+    wait_evt = user_evt;
+
+    for (uint i = 0; i < level; i++) {  // each work-item enqueues level child launches
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0,
+                                   /*&user_evt*/ NULL, &block_evt, ^{
+                                     childKernel(buf, level - 1, block_evt);
+                                   });
+
+      // wait_evt = block_evt;
+    }
+    if (is_valid_event(user_evt)) {
+      set_user_event_status(user_evt, CL_COMPLETE);
+      release_event(user_evt);
+    }
+  } else {
+    int idx = get_global_id(0);
+    if (idx < 0) {
+      buf[idx] = 0;
+    }
+  }
+}
+  \n)};
+
+// Sizes the sub-test space as (levels x queue sizes x thread counts) and puts
+// every data member into a defined state. The per-sub-test values are
+// overwritten by open(); the defaults only guarantee that nothing is read
+// while indeterminate if open() returns early.
+OCLPerfDeviceEnqueueEvent::OCLPerfDeviceEnqueueEvent() {
+  subTests_level = sizeof(levelList) / sizeof(levelList[0]);
+  subTests_qsize = sizeof(qsizeList) / sizeof(qsizeList[0]);
+  subTests_thread = sizeof(testList) / sizeof(testList[0]);
+  testListSize = subTests_thread;
+  _numSubTests = subTests_level * subTests_qsize * subTests_thread;
+
+  deviceQueue_ = NULL;
+  kernel2_ = NULL;
+  failed_ = false;
+  skip_ = false;
+  level = 2;
+
+  // These were previously left uninitialised by the constructor.
+  testID_ = 0;
+  threads = testList[0].threads;  // non-zero: run() divides by threads
+  queueSize = qsizeList[0] * 1024;
+  lws_value = 64;  // same value open() assigns
+}
+
+OCLPerfDeviceEnqueueEvent::~OCLPerfDeviceEnqueueEvent() {}  // empty: CL handles are released in close()
+
+// Prepares sub-test `test` on device `deviceId`.
+//
+// Decodes `test` into a (threads, queue size, level) combination, checks that
+// the device reports OpenCL 2.0 or newer, builds parentKernel/childKernel,
+// allocates the scratch buffer and creates the default on-device queue.
+//
+// Returns immediately for CPU devices. Sets failed_ when the device version
+// is below 2.0 and skip_ when the host headers lack CL_VERSION_2_0; run()
+// returns early in both cases. `units` and `conversion` are only forwarded
+// to OCLTestImp::open().
+void OCLPerfDeviceEnqueueEvent::open(unsigned int test, char* units,
+                                     double& conversion,
+                                     unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  // The sub-test index varies level fastest, then queue size, then threads.
+  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
+  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
+  level = levelList[testID_ % subTests_level];
+
+  lws_value = 64;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // The version string reads "OpenCL <major>.<minor> ...", so index 7 holds
+  // the major digit. Evaluate it and free the buffer before any return below,
+  // so that no exit path leaks it; a string too short to hold the digit is
+  // treated as unsupported instead of being read out of bounds.
+  const bool preCL20 = (error_ == CL_SUCCESS) &&
+                       (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Size the log buffer from the runtime: with a fixed-size buffer that is
+    // too small the query fails and the buffer would be printed unwritten.
+    // error_ is deliberately not touched so the build failure is reported.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char* programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    if (logSize > 0) {
+      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                      CL_PROGRAM_BUILD_LOG, logSize,
+                                      programLog, NULL);
+    }
+    programLog[logSize] = '\0';
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  // Default on-device queue that the kernels obtain via get_default_queue().
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced elsewhere in this file
+
+// Times repeated parentKernel launches on the host queue of the selected
+// device. The rate (threads * repeats * level, in millions per second of wall
+// time) goes to _perfInfo and a description of the sub-test to
+// testDescString. Does nothing for CPU devices or when open() set failed_ or
+// skip_.
+void OCLPerfDeviceEnqueueEvent::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU || failed_ || skip_) {
+    return;
+  }
+
+  cl_mem scratch = buffers()[0];
+  size_t globalSize[1] = {threads};
+  size_t localSize[1] = {lws_value};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &scratch);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // One launch outside the timed region.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, globalSize, localSize, 0,
+                                            NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Larger launches get fewer repeats, to normalize the work per sub-test.
+  unsigned int repeats = (4096 / threads) * 10;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass < repeats; pass++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, globalSize, localSize, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double elapsedSec = timer.GetElapsedTime();
+  unsigned int dispatches = threads * repeats * level;
+  _perfInfo = (float)(dispatches) / (float)(elapsedSec * 1000000.);
+
+  char desc[256];
+  SNPRINTF(
+      desc, sizeof(desc),
+      "%5d threads spawning %2d threads, queue size %3dKB (Mdisp/s), level=%2d",
+      threads, lws_value, queueSize / 1024, level);
+  testDescString = desc;
+}
+
+// Releases the device queue and the child kernel created by open(), then
+// defers to OCLTestImp::close() and returns its result. Returns 0 straight
+// away for CPU devices, for which open() creates nothing.
+unsigned int OCLPerfDeviceEnqueueEvent::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    // Forget the handle so a later close(), or an open() that returns before
+    // recreating the queue, cannot release it a second time.
+    deviceQueue_ = NULL;
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel2_ = NULL;  // same reasoning as for deviceQueue_
+  }
+  return OCLTestImp::close();
+}
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
+#define _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDeviceEnqueueEvent : public OCLTestImp {  // device-enqueue perf test over thread count x queue size x level
+ public:
+  OCLPerfDeviceEnqueueEvent();
+  virtual ~OCLPerfDeviceEnqueueEvent();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device default queue; created in open(), released in close()
+  unsigned int testID_;  // sub-test index handed to open()
+  cl_kernel kernel2_;  // "childKernel" handle; released in close()
+  unsigned int testListSize;  // same value as subTests_thread
+  unsigned int threads;  // global work size of each parentKernel launch
+  cl_uint queueSize;  // value passed as CL_QUEUE_SIZE
+  unsigned int subTests_level;  // number of entries in levelList
+  unsigned int subTests_qsize;  // number of entries in qsizeList
+  unsigned int subTests_thread;  // number of entries in testList
+  unsigned int level;  // value passed as the kernels' level argument
+  unsigned int lws_value;  // local work size used by run(); open() sets it to 64
+
+  bool failed_;  // set by open() when the device version is below 2.0
+  bool skip_;  // set by open() when the host build lacks CL_VERSION_2_0
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
@@ -0,0 +1,233 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueueSier.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;  // NOTE(review): testStruct is not used anywhere in this file
+} testStruct;
+
+static unsigned int sizeList[] = {  // image_size per sub-test: the powers of three from 3^4 to 3^10
+    81, 243, 729, 2187, 6561, 19683, 59049,
+};
+
+const static char* strKernel = {KERNEL_CODE(  // OpenCL C source; open() builds it with -cl-std=CL2.0
+    \n __kernel void parentKernel(__global uint* buf, int width, int offsetx,
+                                   int offsety) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  queue_t q = get_default_queue();
+
+  int one_third = get_global_size(0) / 3;
+  int two_thirds = 2 * one_third;
+
+  if (x >= one_third && x < two_thirds && y >= one_third && y < two_thirds) {  // middle ninth: no further enqueue
+    int idx = get_global_id(0);
+    if (idx < 0) {  // NOTE(review): looks never true; presumably just keeps a store to buf in the kernel - confirm
+      buf[idx] = 0;
+    }
+  } else {
+    if (one_third > 1 && x % one_third == 0 && y % one_third == 0) {  // one enqueuing work-item per outer sub-square
+      const size_t grid[2] = {one_third, one_third};
+      enqueue_kernel(q, 0, ndrange_2D(grid), ^{
+        parentKernel(buf, width, x + offsetx, y + offsety);
+      });
+    }
+  }
+}
+    \n)};
+
+// Registers one sub-test per entry of sizeList and puts every data member
+// into a defined state. queueSize and image_size are overwritten by open();
+// the defaults only guarantee that nothing is read while indeterminate if
+// open() returns early.
+OCLPerfDeviceEnqueueSier::OCLPerfDeviceEnqueueSier() {
+  _numSubTests = sizeof(sizeList) / sizeof(sizeList[0]);
+  deviceQueue_ = NULL;
+  failed_ = false;
+  skip_ = false;
+
+  // These were previously left uninitialised by the constructor
+  // (testListSize is not assigned anywhere else in this file).
+  testID_ = 0;
+  testListSize = sizeof(sizeList) / sizeof(sizeList[0]);
+  queueSize = 512 * 1024;  // same value open() assigns
+  image_size = sizeList[0];
+}
+
+OCLPerfDeviceEnqueueSier::~OCLPerfDeviceEnqueueSier() {}  // empty: the device queue is released in close()
+
+// Prepares sub-test `test` on device `deviceId`.
+//
+// Checks that the device reports OpenCL 2.0 or newer, builds parentKernel,
+// allocates the scratch buffer, picks image_size from sizeList[test] and
+// creates the default on-device queue (512 KB).
+//
+// Returns immediately for CPU devices. Sets failed_ when the device version
+// is below 2.0 and skip_ when the host headers lack CL_VERSION_2_0; run()
+// returns early in both cases. `units` and `conversion` are only forwarded
+// to OCLTestImp::open().
+void OCLPerfDeviceEnqueueSier::open(unsigned int test, char* units,
+                                    double& conversion, unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // The version string reads "OpenCL <major>.<minor> ...", so index 7 holds
+  // the major digit. Evaluate it and free the buffer before any return below,
+  // so that no exit path leaks it; a string too short to hold the digit is
+  // treated as unsupported instead of being read out of bounds.
+  const bool preCL20 = (error_ == CL_SUCCESS) &&
+                       (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Size the log buffer from the runtime: with a fixed-size buffer that is
+    // too small the query fails and the buffer would be printed unwritten.
+    // error_ is deliberately not touched so the build failure is reported.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char* programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    if (logSize > 0) {
+      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                      CL_PROGRAM_BUILD_LOG, logSize,
+                                      programLog, NULL);
+    }
+    programLog[logSize] = '\0';
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  queueSize = 512 * 1024;
+
+  image_size = sizeList[testID_];
+
+#if defined(CL_VERSION_2_0)
+  // Default on-device queue that the kernel obtains via get_default_queue().
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced elsewhere in this file
+
+// Times 100 launches of parentKernel over an image_size x image_size range
+// and reports the rate in _perfInfo as (kernel count * repeats) in millions
+// per second of wall time, with a description of the sub-test in
+// testDescString. The kernel count is 8^(log3(image_size) - 1). Does nothing
+// for CPU devices or when open() set failed_ or skip_.
+void OCLPerfDeviceEnqueueSier::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  if (failed_) {
+    return;
+  }
+
+  if (skip_) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {1};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  int width = image_size, offsetx = 0, offsety = 0;
+  error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(int), (void*)&width);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(int), (void*)&offsetx);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 3, sizeof(int), (void*)&offsety);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // One single-work-item launch outside the timed region.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, 0, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  size_t global_work_size[2] = {image_size, image_size};
+
+  // Try to normalize the amount of work per test
+  unsigned int repeats = 100;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                              NULL, global_work_size, 0, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  // 8^(log3(image_size) - 1), computed with integers. The previous
+  // (int)pow(8.0, log(n) / log(3) - 1) truncated to one less than the exact
+  // power whenever the floating-point quotient fell just below an integer.
+  unsigned int numOfKernels = 1;
+  for (unsigned int side = image_size; side > 3; side /= 3) {
+    numOfKernels *= 8;
+  }
+  // Multiply in double: 8^9 * 100 (image_size 59049) overflows unsigned int.
+  _perfInfo = static_cast<float>(static_cast<double>(numOfKernels) * repeats) /
+              (float)(sec * 1000000.);
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "image_size = %5d, queue size %3dKB (Mdisp/s)",
+           image_size, queueSize / 1024);
+  testDescString = buf;
+}
+
+// Releases the device queue created by open(), then defers to
+// OCLTestImp::close() and returns its result. Returns 0 straight away for
+// CPU devices, for which open() creates nothing.
+unsigned int OCLPerfDeviceEnqueueSier::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    // Forget the handle so a later close(), or an open() that returns before
+    // recreating the queue, cannot release it a second time.
+    deviceQueue_ = NULL;
+  }
+
+  return OCLTestImp::close();
+}
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_SIER_H_
+#define _OCLPERF_DEVICE_ENQUEUE_SIER_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDeviceEnqueueSier : public OCLTestImp {  // device-enqueue perf test with one sub-test per image size
+ public:
+  OCLPerfDeviceEnqueueSier();
+  virtual ~OCLPerfDeviceEnqueueSier();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device default queue; created in open(), released in close()
+  unsigned int testID_;  // sub-test index handed to open(); indexes sizeList
+  unsigned int testListSize;  // NOTE(review): never read by the OCLPerfDeviceEnqueueSier implementation
+  // unsigned int        threads;
+  cl_uint queueSize;  // value passed as CL_QUEUE_SIZE; open() sets 512 * 1024
+  unsigned int image_size;  // edge length of the 2-D global range used in run()
+
+  bool failed_;  // set by open() when the device version is below 2.0
+  bool skip_;  // set by open() when the host build lacks CL_VERSION_2_0
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_SIER_H_
@@ -0,0 +1,391 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDispatchSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: select the bounded string formatter per platform.
+// SNPRINTF expands to sprintf_s when WIN_OS is defined and to snprintf
+// otherwise; both are called below as SNPRINTF(buffer, size, format, ...).
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// NOTE(review): CHAR_BUF_SIZE is not referenced in the visible part of this
+// file (the description buffers below are sized 256) -- confirm it is needed.
+#define CHAR_BUF_SIZE 512
+
+// One dispatch-speed configuration.
+struct testStruct {
+  // Number of kernel dispatches issued inside the timed loop.
+  unsigned int iterations;
+  // Flush/wait after every `flushEvery` dispatches. Only values > 0 enable
+  // the periodic wait; the table uses -1 for "wait once at the end".
+  int flushEvery;
+};
+
+// Configurations exercised by OCLPerfDispatchSpeed, one row per dispatch
+// count: {iterations, flushEvery}. Note that the single-dispatch case is
+// intentionally listed twice.
+testStruct testList[] = {
+    {1, -1},      {1, -1},
+    {10, 1},      {10, -1},
+    {100, 1},     {100, 10},     {100, -1},
+    {1000, 1},    {1000, 10},    {1000, 100},    {1000, -1},
+    {10000, 1},   {10000, 10},   {10000, 100},   {10000, 1000},
+    {10000, -1},
+    {100000, 1},  {100000, 10},  {100000, 100},  {100000, 1000},
+    {100000, 10000},             {100000, -1},
+};
+
+// Iteration counts used by OCLPerfMapDispatchSpeed::run(), indexed by
+// _openTest. Each iteration performs one map, one unmap and one dispatch.
+unsigned int mapTestList[] = {
+    1, 1, 10, 100, 1000, 10000, 100000,
+};
+
+// Stores the OpenCL C source of the benchmark kernel in shader_, replacing
+// any previous contents. The kernel only writes to outBuf when its global id
+// is negative, so a dispatch does essentially no memory work.
+void OCLPerfDispatchSpeed::genShader(void) {
+  const char *kernelSource =
+      "__kernel void _dispatchSpeed(__global float *outBuf)\n"
+      "{\n"
+      "    int i = (int) get_global_id(0);\n"
+      "    if (i < 0)\n"
+      "        outBuf[i] = 0.0f;\n"
+      "}\n";
+  shader_ = kernelSource;
+}
+
+// Sizes the sub-test space: every testList entry is run in four variants
+// (with/without warm-up, times spin/sleep waiting -- see open()).
+OCLPerfDispatchSpeed::OCLPerfDispatchSpeed() {
+  const unsigned int entryCount = sizeof(testList) / sizeof(testList[0]);
+  testListSize = entryCount;
+  _numSubTests = 4 * entryCount;
+}
+
+// Nothing to do here: the OpenCL objects created in open() are released by
+// close().
+OCLPerfDispatchSpeed::~OCLPerfDispatchSpeed() {}
+
+// Context error callback handed to clCreateContext() in open(). It
+// intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares one dispatch-speed sub-test.
+//
+// The sub-test index `test` is decoded as follows:
+//   - test % testListSize            -> testList entry (stored in _openTest)
+//   - (test / testListSize) % 2 != 0 -> issue a warm-up dispatch before timing
+//   - test >= 2 * testListSize       -> wait with clFinish ("sleep") instead
+//                                       of polling the event status ("spin")
+// The function then creates the context, command queue, output buffer,
+// program and kernel on device `deviceId` of platform `_platformIndex`.
+//
+// `units` is not written; `conversion` is set to 1.0. Failures are reported
+// through CHECK_RESULT; objects created before a failure are released later
+// by close().
+void OCLPerfDispatchSpeed::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test % testListSize;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+  doWarmup = ((test / testListSize) % 2) != 0;
+  sleep = (test >= (testListSize * 2));
+
+  bufSize_ = 64 * sizeof(cl_float);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy out the single handle we need and free the array *before* the
+    // error check, so no exit path leaks it. An out-of-range platform index
+    // leaves `platform` NULL and is reported by the check further down.
+    if ((error_ == CL_SUCCESS) &&
+        (static_cast<cl_uint>(_platformIndex) < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[], so delete[] is required
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    if (platform != NULL) {
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so the result is deliberately not checked
+      // here; the device query below reports the failure.
+    }
+  } else {
+    CHECK_RESULT(numPlatforms == 0, "No platforms available!");
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // As above: take the handle, free the temporary list, then run the checks
+  // so that neither the success path nor the failure paths leak `devices`.
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, &error_);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  const char *source = shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    // Keep the buffer printable even if the log query itself fails.
+    log[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    CHECK_RESULT(0, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_dispatchSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+}
+
+// Times testList[_openTest].iterations back-to-back dispatches of the kernel
+// and reports the average cost of one dispatch in microseconds.
+//
+// When the entry's flushEvery is > 0 the host waits after every flushEvery-th
+// dispatch, and always once more after the last one. Waiting is either
+// clFinish ("sleep" mode) or clFlush followed by polling the event status
+// until it reaches CL_COMPLETE or an error ("spin" mode).
+//
+// Results: _perfInfo receives the us/dispatch figure and testDescString a
+// human-readable description of the configuration.
+void OCLPerfDispatchSpeed::run(void) {
+  int global = bufSize_ / sizeof(cl_float);
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+  cl_event event = NULL;
+  // Initialised so the spin loops never test an indeterminate value when
+  // clGetEventInfo fails without writing the status.
+  cl_int eventStatus = CL_COMPLETE;
+
+  if (doWarmup) {
+    // The warm-up dispatch is followed by clFinish, so no event is needed.
+    // Requesting one (as the code used to) leaked it, because `event` is
+    // overwritten by the first timed dispatch without being released.
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testList[_openTest].iterations; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, &event);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    if ((testList[_openTest].flushEvery > 0) &&
+        (((i + 1) % testList[_openTest].flushEvery) == 0)) {
+      if (sleep) {
+        _wrapper->clFinish(cmd_queue_);
+      } else {
+        _wrapper->clFlush(cmd_queue_);
+        // Spin until the event completes (status 0) or fails (status < 0).
+        // Stop as well if the query itself fails, instead of spinning forever.
+        do {
+          error_ =
+              _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                       sizeof(cl_int), &eventStatus, NULL);
+        } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+      }
+    }
+    // The last event is kept alive: it is needed for the final wait below.
+    if (i != (testList[_openTest].iterations - 1)) {
+      _wrapper->clReleaseEvent(event);
+    }
+  }
+  if (sleep) {
+    _wrapper->clFinish(cmd_queue_);
+  } else {
+    _wrapper->clFlush(cmd_queue_);
+    do {
+      error_ =
+          _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof(cl_int), &eventStatus, NULL);
+    } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+  }
+  _wrapper->clReleaseEvent(event);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // microseconds per launch
+  double perf = (1000000.f * sec / testList[_openTest].iterations);
+
+  // Pieces of the description string. "spin" needs an extra 'n' to form
+  // "spinning", and a trailing space to line up with "sleep" in the
+  // no-flush variant.
+  const char *waitType = sleep ? "sleep" : "spin";
+  const char *n = sleep ? "" : "n";
+  const char *extraChar = sleep ? "" : " ";
+  const char *warmup = doWarmup ? "warmup" : "";
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  if (testList[_openTest].flushEvery > 0) {
+    SNPRINTF(buf, sizeof(buf),
+             " %7d dispatches %s%sing every %5d %6s (us/disp)",
+             testList[_openTest].iterations, waitType, n,
+             testList[_openTest].flushEvery, warmup);
+  } else {
+    SNPRINTF(buf, sizeof(buf),
+             " %7d dispatches (%s%s)              %6s (us/disp)",
+             testList[_openTest].iterations, waitType, extraChar, warmup);
+  }
+  testDescString = buf;
+}
+
+// Releases every OpenCL object that open() managed to create (buffer, kernel,
+// program, queue, context -- in that order). A failing release is reported
+// through CHECK_RESULT_NO_RETURN and the remaining objects are still
+// released. Returns _crcword.
+unsigned int OCLPerfDispatchSpeed::close(void) {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+// Sizes the sub-test space from mapTestList: every entry is run twice
+// (without and with a warm-up dispatch, as decoded by the inherited open()).
+OCLPerfMapDispatchSpeed::OCLPerfMapDispatchSpeed() {
+  const unsigned int entryCount = sizeof(mapTestList) / sizeof(mapTestList[0]);
+  testListSize = entryCount;
+  _numSubTests = 2 * entryCount;
+}
+
+// Times mapTestList[_openTest] iterations of "blocking map for write, unmap,
+// dispatch" on a CL_MEM_ALLOC_HOST_PTR buffer and reports the average cost of
+// one iteration in microseconds.
+//
+// The buffer is private to this call: it is created here, bound as kernel
+// argument 0 (replacing the buffer bound by open()) and released at the end.
+// Results: _perfInfo receives the us/iteration figure and testDescString a
+// description of the configuration.
+void OCLPerfMapDispatchSpeed::run(void) {
+  cl_mem outBuffer;
+  outBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                       bufSize_, NULL, &error_);
+  // Check the buffer created above. The previous code tested the member
+  // outBuffer_ (created in open()), so a failed allocation went unnoticed.
+  CHECK_RESULT(outBuffer == 0, "clCreateBuffer(outBuffer) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer);
+
+  int global = bufSize_ / sizeof(cl_float);
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  if (doWarmup) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Reset();
+  timer.Start();
+  void *mem;
+  for (unsigned int i = 0; i < mapTestList[_openTest]; i++) {
+    // Blocking map (CL_TRUE): returns once the region is mapped.
+    mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer, CL_TRUE,
+                                       CL_MAP_WRITE_INVALIDATE_REGION, 0,
+                                       bufSize_, 0, NULL, NULL, &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // microseconds per launch
+  double perf = (1000000.f * sec / mapTestList[_openTest]);
+  const char *warmup = doWarmup ? "warmup" : "";
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %7d maps and dispatches %6s (us/disp)",
+           mapTestList[_openTest], warmup);
+  testDescString = buf;
+
+  _wrapper->clReleaseMemObject(outBuffer);
+}
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DispatchSpeed_H_
+#define _OCL_DispatchSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Measures the host-side cost of dispatching a near-empty kernel, for a table
+// of dispatch counts and flush intervals. All members are public because the
+// derived OCLPerfMapDispatchSpeed reuses the objects created by open().
+class OCLPerfDispatchSpeed : public OCLTestImp {
+ public:
+  OCLPerfDispatchSpeed();
+  virtual ~OCLPerfDispatchSpeed();
+
+  // Test life-cycle entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+  // Kernel source text; filled in by genShader().
+  std::string shader_;
+  void genShader();
+
+  // OpenCL objects created by open() and released by close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  // Result of the most recent OpenCL call.
+  cl_int error_;
+  // True when run() should issue one untimed dispatch first.
+  bool doWarmup;
+
+  // Size of the output buffer in bytes.
+  unsigned int bufSize_;
+  // True: wait with clFinish; false: flush and poll the event status.
+  bool sleep;
+  // Number of entries in the configuration table used by this test.
+  unsigned int testListSize;
+};
+
+// Variant of OCLPerfDispatchSpeed whose run() precedes every dispatch with a
+// map/unmap of the output buffer. open() and close() are inherited.
+class OCLPerfMapDispatchSpeed : public OCLPerfDispatchSpeed {
+ public:
+  OCLPerfMapDispatchSpeed();
+
+  virtual void run();
+};
+#endif  // _OCL_DispatchSpeed_H_
@@ -0,0 +1,442 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDoubleDMA.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <cmath>
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Dimensions, in cl_float4 elements, of the full data set that is streamed
+// through the device.
+const size_t blockX = 256;
+const size_t blockY = 256;
+const size_t blockZ = 512;
+
+// Number of blockX x blockY slices transferred per write/read step.
+const size_t chunk = 16;
+// size_S: bytes in the whole host-side data set; size_s: bytes in one
+// transfer step. run() advances through size_S in steps of size_s.
+const size_t size_S = blockX * blockY * blockZ * sizeof(cl_float4);
+const size_t size_s = blockX * blockY * chunk * sizeof(cl_float4);
+// Width, in characters, of the text timeline printed by ProfileQueue.
+static const int WindowWidth = 80;
+
+// Upper bound on the number of command queues (and device buffers) a sub-test
+// uses.
+const size_t MaxQueues = 3;
+// File-wide switch for event profiling; written by OCLPerfDoubleDMA::open()
+// and read by OCLPerfDoubleDMA::run().
+bool profEnable = false;
+
+// OpenCL C source of the "dummy" kernel run between the write and the read.
+// Each work-item loops (id / 0x400) - 1 times multiplying a counter, then
+// stores a float4 scaled by that counter to out[id], so the amount of work
+// grows with the global id.
+static const char* strKernel =
+    "__kernel void dummy(__global float4* out)  \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   float4 value = (float4)(1.0f, 2.0f, 3.0f, 4.0f);  \n"
+    "   uint factorial = 1;                     \n"
+    "   for (uint i = 1; i < (id / 0x400); ++i)\n"
+    "   {                                       \n"
+    "       factorial *= i;                     \n"
+    "   }                                       \n"
+    "   out[id] = value * factorial;            \n"
+    "}                                          \n";
+
+// Collects the profiling events recorded on one command queue, grouped by
+// operation kind, and renders them as a WindowWidth-character text timeline.
+//
+// The object takes over the caller's reference to every event passed to
+// addEvent() and releases it in the destructor. It must therefore not be
+// copied (a copy would release the same events twice).
+class ProfileQueue {
+ public:
+  // Operation kinds; `Total` is the number of kinds, not a kind itself.
+  enum Operation { Write = 0, Execute, Read, Total };
+
+  // Per-operation display name, and the characters used for the first and
+  // the remaining columns of a command in the timeline.
+  static const char* OperationName[Total];
+  static const char StartCommand[Total];
+  static const char ExecCommand[Total];
+
+  ProfileQueue() {}
+  ~ProfileQueue() {
+    for (size_t op = 0; op < Total; ++op) {
+      for (size_t idx = 0; idx < events_[op].size(); ++idx) {
+        clReleaseEvent(events_[op][idx]);
+      }
+    }
+  }
+
+  // Records `event` under operation `op`. Events are expected to be added in
+  // submission order (findMinMax relies on first/last).
+  void addEvent(Operation op, cl_event event) { events_[op].push_back(event); }
+
+  // Widens [*min, *max] so that it covers the start of the first and the end
+  // of the last event of every operation. A value of 0 in *min / *max means
+  // "not set yet" and is simply overwritten.
+  void findMinMax(cl_long* min, cl_long* max) {
+    // Find time min/max ranges for the frame scaling
+    for (size_t op = 0; (op < ProfileQueue::Total); ++op) {
+      cl_long time;
+      if (events_[op].size() == 0) continue;
+      clGetEventProfilingInfo(events_[op][0], CL_PROFILING_COMMAND_START,
+                              sizeof(cl_long), &time, NULL);
+      if (0 == *min) {
+        *min = time;
+      } else {
+        *min = std::min(*min, time);
+      }
+      clGetEventProfilingInfo(events_[op][events_[op].size() - 1],
+                              CL_PROFILING_COMMAND_END, sizeof(cl_long), &time,
+                              NULL);
+      if (0 == *max) {
+        *max = time;
+      } else {
+        *max = std::max(*max, time);
+      }
+    }
+  }
+
+  // Prints one timeline row per operation kind that has events, covering the
+  // absolute time window [start, finish). Each column stands for
+  // (finish - start) / WindowWidth time units; '-' marks idle time, '+' a
+  // command shorter than one column.
+  void display(cl_long start, cl_long finish) {
+    std::string graph;
+    graph.resize(WindowWidth + 1);
+    graph[WindowWidth] = '\x0';
+    cl_long timeFrame = finish - start;
+    cl_long interval = timeFrame / WindowWidth;
+    // A window shorter than WindowWidth ticks (or an inverted one) has no
+    // usable column width; scaling by it would divide by zero.
+    if (interval <= 0) return;
+
+    // Find time min/max ranges for the frame scaling
+    for (size_t op = 0; (op < Total); ++op) {
+      if (events_[op].size() == 0) continue;
+      cl_long timeStart, timeEnd;
+      int begin = 0, end = 0;
+      for (size_t idx = 0; idx < events_[op].size(); ++idx) {
+        bool cutStart = false;
+        clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_START,
+                                sizeof(cl_long), &timeStart, NULL);
+        clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_END,
+                                sizeof(cl_long), &timeEnd, NULL);
+
+        // Continue if out of the frame scope
+        if (timeStart >= finish) continue;
+        if (timeEnd <= start) continue;
+
+        // Clip the command to the window; remember a clipped start so the
+        // start marker is not drawn for it.
+        if (timeStart <= start) {
+          timeStart = start;
+          cutStart = true;
+        }
+
+        if (timeEnd >= finish) {
+          timeEnd = finish;
+        }
+
+        // Readjust time to the frame
+        timeStart -= start;
+        timeEnd -= start;
+        timeStart = static_cast<cl_long>(
+            floor(static_cast<float>(timeStart) / interval + 0.5f));
+        timeEnd = static_cast<cl_long>(
+            floor(static_cast<float>(timeEnd) / interval + 0.5f));
+        // `interval` is rounded down, so the scaled values can land past the
+        // last column; clamp them so no write goes beyond the graph buffer.
+        timeStart = std::min<cl_long>(timeStart, WindowWidth);
+        timeEnd = std::min<cl_long>(timeEnd, WindowWidth);
+        begin = static_cast<int>(timeStart);
+        // Idle from end to begin
+        for (int c = end; c < begin; ++c) {
+          graph[c] = '-';
+        }
+        end = static_cast<int>(timeEnd);
+        for (int c = begin; c < end; ++c) {
+          if ((c == begin) && !cutStart) {
+            graph[c] = StartCommand[op];
+          } else {
+            graph[c] = ExecCommand[op];
+          }
+        }
+        if ((begin == end) && (end < WindowWidth)) {
+          graph[begin] = '+';
+        }
+      }
+      // Pad the rest of the row as idle.
+      if (end < WindowWidth) {
+        for (int c = end; c < WindowWidth; ++c) {
+          graph[c] = '-';
+        }
+      }
+      printf("%s\n", graph.c_str());
+    }
+  }
+
+ private:
+  // Profiling events
+  std::vector<cl_event> events_[Total];
+};
+
+// Lookup tables indexed by ProfileQueue::Operation (Write, Execute, Read).
+
+// Name printed in the timeline legend.
+const char* ProfileQueue::OperationName[Total] = {
+    "BufferWrite",
+    "KernelExecution",
+    "BufferRead",
+};
+// Character drawn in the column where a command starts.
+const char ProfileQueue::StartCommand[Total] = {
+    'W',
+    'X',
+    'R',
+};
+// Character drawn in the remaining columns of a command.
+const char ProfileQueue::ExecCommand[Total] = {
+    '>',
+    '#',
+    '<',
+};
+
+// Aggregates one ProfileQueue per command queue. When constructed with
+// profiling disabled, addEvent() and display() do nothing.
+class Profile {
+ public:
+  // profEna:   whether events are recorded and displayed at all.
+  // numQueues: number of queues in use; callers index queues 0..numQueues-1
+  //            and the storage holds at most MaxQueues of them.
+  Profile(bool profEna, int numQueues)
+      : profileEna_(profEna),
+        numQueues_(numQueues),
+        min_(0),
+        max_(0),
+        execTime_(0) {}
+
+  ~Profile() {}
+
+  // Hands `event` to the given queue's recorder (profiling enabled only).
+  void addEvent(int queue, ProfileQueue::Operation op, cl_event event) {
+    if (!profileEna_) {
+      return;
+    }
+    profQueue[queue].addEvent(op, event);
+  }
+
+  // Returns the span between the earliest start and the latest end over all
+  // queues. A non-zero result is cached and returned on later calls.
+  cl_long findExecTime() {
+    if (execTime_ == 0) {
+      for (int queueIdx = 0; queueIdx < numQueues_; ++queueIdx) {
+        profQueue[queueIdx].findMinMax(&min_, &max_);
+      }
+      execTime_ = max_ - min_;
+    }
+    return execTime_;
+  }
+
+  // Prints a legend followed by the timelines of all queues for the window
+  // [start, finish), given relative to the earliest recorded timestamp.
+  // findExecTime() must have run first, since it establishes that timestamp.
+  void display(cl_long start, cl_long finish) {
+    if (!profileEna_) return;
+
+    const float span = (float)(finish - start);
+    printf("\n ----------- Time frame %.3f (us), scale 1:%.0f\n", span / 1000,
+           span / (1000 * WindowWidth));
+
+    for (size_t kind = 0; kind < ProfileQueue::Total; ++kind) {
+      printf("%s - %c%c; ", ProfileQueue::OperationName[kind],
+             ProfileQueue::StartCommand[kind], ProfileQueue::ExecCommand[kind]);
+    }
+    printf("\n");
+
+    for (int queueIdx = 0; queueIdx < numQueues_; ++queueIdx) {
+      printf("CommandQueue #%d\n", queueIdx);
+      profQueue[queueIdx].display(min_ + start, min_ + finish);
+    }
+  }
+
+ private:
+  bool profileEna_;
+  int numQueues_;     //!< Total number of queues
+  cl_long min_;       //!< Min HW timestamp
+  cl_long max_;       //!< Max HW timestamp
+  cl_long execTime_;  //!< Profile time
+  ProfileQueue profQueue[MaxQueues];
+};
+
+// Sub-tests: 1..MaxQueues queues, without and with the kernel stage, each
+// without and with profiling -- 4 * MaxQueues combinations in total.
+OCLPerfDoubleDMA::OCLPerfDoubleDMA() {
+  failed_ = false;
+  _numSubTests = 4 * MaxQueues;
+}
+
+// Nothing to release here; cleanup is delegated to close().
+OCLPerfDoubleDMA::~OCLPerfDoubleDMA() {}
+
+// Prepares one double-DMA sub-test on device `deviceId`.
+//
+// The sub-test index `test` is decoded as follows:
+//   - test >= 2 * MaxQueues     -> event profiling enabled (profEnable)
+//   - test % (2 * MaxQueues)    -> stored in test_; run() derives the queue
+//                                  count and the kernel stage from it
+//   - (test_ % MaxQueues) + 1   -> number of device buffers created here
+// After the device buffers, one CL_MEM_ALLOC_HOST_PTR buffer of size_S bytes
+// is appended to buffers_; run() maps it as the host-side data set.
+//
+// On a non-GPU device the test marks itself failed_ and run() becomes a
+// no-op. Other failures are reported through CHECK_RESULT.
+void OCLPerfDoubleDMA::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  // Start from a clean state so that a previous open() cannot leak its
+  // outcome into this sub-test.
+  failed_ = false;
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    // Keep the buffer printable even if the log query itself fails.
+    programLog[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  size_t bufSize = size_s;
+  cl_mem buffer;
+  // profEnable is a file-scope flag, so it must be assigned in both
+  // directions: only setting it to true would keep profiling switched on for
+  // every non-profiling sub-test opened afterwards.
+  profEnable = (test_ >= (2 * MaxQueues));
+  test_ %= 2 * MaxQueues;
+  size_t numBufs = (test_ % MaxQueues) + 1;
+  for (size_t b = 0; b < numBufs; ++b) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize,
+                                      NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+
+  // Host-visible staging buffer holding the whole data set; always stored
+  // last, i.e. at index numBufs.
+  buffer = _wrapper->clCreateBuffer(context_,
+                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                                    size_S, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+// Context error callback that ignores every notification.
+// NOTE(review): nothing in the visible part of this file references it --
+// confirm whether it is still needed.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Streams the size_S-byte host data set through the device in size_s-byte
+// steps: write to a device buffer, optionally run the kernel on it, read it
+// back to the same host location. Consecutive stages are issued on
+// consecutive queues (round-robin) and chained with events, so transfers of
+// different steps can overlap.
+//
+// Reports the achieved bandwidth (2 * size_S bytes over the elapsed wall
+// time, in GB/s) in _perfInfo and a description in testDescString. With
+// profiling enabled, a text timeline of four iterations from the middle of
+// the run is printed as well.
+//
+// Event ownership: with profiling disabled every event is released here as
+// soon as it has been used as a dependency; with profiling enabled the
+// events are handed to `profile`, which releases them when it is destroyed.
+void OCLPerfDoubleDMA::run(void) {
+  if (failed_) {
+    return;
+  }
+  CPerfCounter timer;
+  const int numQueues = (test_ % MaxQueues) + 1;
+  const bool useKernel = ((test_ / MaxQueues) > 0);
+  const int numBufs = numQueues;
+  Profile profile(profEnable, numQueues);
+
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  int q;
+  cl_command_queue_properties qProp =
+      (profEnable) ? CL_QUEUE_PROFILING_ENABLE : 0;
+  for (q = 0; q < numQueues; ++q) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[_deviceId], qProp, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues[q] = cmdQueue;
+  }
+
+  // buffers_[numBufs] is the host-visible staging buffer created last in
+  // open(); the device buffers occupy indices 0..numBufs-1.
+  float* Data_s = (float*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      size_S, 0, NULL, NULL, &error_);
+  // Every transfer below uses Data_s as its host pointer, so a failed map
+  // must stop the test here.
+  CHECK_RESULT((error_ != CL_SUCCESS) || (Data_s == NULL),
+               "clEnqueueMapBuffer() failed");
+
+  size_t gws[1] = {size_s / (4 * sizeof(float))};
+  size_t lws[1] = {256};
+
+  // Warm-up
+  for (q = 0; q < numQueues; ++q) {
+    error_ |=
+        _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                       size_s, (char*)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                       (void*)&buffers_[q]);
+    error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                               gws, lws, 0, NULL, NULL);
+    error_ |=
+        _wrapper->clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                      size_s, (char*)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  // Latest read (r), write (w) and kernel (x) event of each device buffer.
+  cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};
+
+  /*----------  pass2:  copy Data_s to and from GPU Buffers ----------*/
+  size_t s_done = 0;
+  timer.Reset();
+  timer.Start();
+  int idx = numBufs - 1;
+  // Start from the last so read/write won't go to the same DMA when kernel is
+  // executed
+  q = numQueues - 1;
+  size_t iter = 0;
+  while (1) {
+    if (0 == r[idx]) {
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char*)Data_s + s_done, 0, NULL, &w[idx]);
+    } else {
+      // The buffer is being reused: wait for its previous read-back.
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char*)Data_s + s_done, 1, &r[idx], &w[idx]);
+      if (!profEnable) {
+        error_ |= _wrapper->clReleaseEvent(r[idx]);
+        // Mark as released so the final cleanup cannot release it again if
+        // the read below fails to produce a new event.
+        r[idx] = 0;
+      }
+    }
+    _wrapper->clFlush(cmdQueues[q]);
+    profile.addEvent(q, ProfileQueue::Write, w[idx]);
+
+    if (useKernel) {
+      // Change the queue
+      q = (q + 1) % numQueues;
+      // Implicit flush of DMA engine on kernel start, because memory dependency
+      error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                         (void*)&buffers_[idx]);
+      error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                                 gws, lws, 1, &w[idx], &x[idx]);
+      if (!profEnable) {
+        error_ |= _wrapper->clReleaseEvent(w[idx]);
+      }
+      profile.addEvent(q, ProfileQueue::Execute, x[idx]);
+    }
+    _wrapper->clFlush(cmdQueues[q]);
+
+    // Change the queue
+    q = (q + 1) % numQueues;
+    error_ |= _wrapper->clEnqueueReadBuffer(
+        cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+        (char*)Data_s + s_done, 1, (useKernel) ? &x[idx] : &w[idx], &r[idx]);
+    if (!profEnable) {
+      error_ |= _wrapper->clReleaseEvent((useKernel) ? x[idx] : w[idx]);
+    }
+    profile.addEvent(q, ProfileQueue::Read, r[idx]);
+    _wrapper->clFlush(cmdQueues[q]);
+
+    if ((s_done += size_s) >= size_S) {
+      break;
+    }
+    ++iter;
+    idx = (idx + 1) % numBufs;
+    q = (q + 1) % numQueues;
+  }
+
+  // Release the last read event of *every* buffer. Releasing only the final
+  // buffer's event (as before) leaked one event per additional buffer.
+  if (!profEnable) {
+    for (int b = 0; b < numBufs; ++b) {
+      if (r[b] != 0) {
+        error_ |= _wrapper->clReleaseEvent(r[b]);
+        r[b] = 0;
+      }
+    }
+  }
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+  timer.Stop();
+
+  // Accumulate with |= so that errors collected in the transfer loop are not
+  // discarded before the check below.
+  error_ |= _wrapper->clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs],
+                                              Data_s, 0, NULL, NULL);
+
+  error_ |= _wrapper->clFinish(cmdQueues[0]);
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  cl_long gpuTimeFrame = profile.findExecTime();
+  cl_long oneIter = gpuTimeFrame / iter;
+
+  // Display 4 iterations in the middle
+  cl_long startFrame = oneIter * (iter / 2 - 2);
+  cl_long finishFrame = oneIter * (iter / 2 + 2);
+  profile.display(startFrame, finishFrame);
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  // Every byte travels twice (host -> device -> host).
+  double GBytes = (double)(2 * size_S) / (double)(1000 * 1000 * 1000);
+  _perfInfo = static_cast<float>(GBytes / timer.GetElapsedTime());
+
+  std::stringstream stream;
+  if (useKernel) {
+    stream << "Write/Kernel/Read operation ";
+  } else {
+    stream << "Write/Read operation ";
+  }
+  stream << numQueues << " queues; profiling "
+         << ((profEnable) ? "enabled" : "disabled") << " [GB/s]";
+
+  testDescString = stream.str();
+}
+
+// This test adds no cleanup of its own; everything is delegated to the base
+// class, whose result is returned unchanged.
+unsigned int OCLPerfDoubleDMA::close(void) {
+  const unsigned int baseResult = OCLTestImp::close();
+  return baseResult;
+}
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_DOUBLE_DMA_H_
+#define _OCL_PERF_DOUBLE_DMA_H_
+
+#include "OCLTestImp.h"
+
+// Bandwidth test that pipelines buffer writes, an optional kernel and buffer
+// reads across one or more command queues.
+class OCLPerfDoubleDMA : public OCLTestImp {
+ public:
+  OCLPerfDoubleDMA();
+  virtual ~OCLPerfDoubleDMA();
+
+  // Test life-cycle entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  // Set by open() when the device is not a GPU; run() then returns at once.
+  bool failed_;
+  // Sub-test index as stored by open() (reduced modulo 2 * MaxQueues there).
+  unsigned int test_;
+};
+
+#endif  // _OCL_PERF_DOUBLE_DMA_H_
--- a/Show More
+++ b/Show More