From 18ce996fe2a5ff9c2cdec3da3a3001fac190a875 Mon Sep 17 00:00:00 2001
From: Vlad Sytchenko <vladislav.sytchenko@amd.com>
Date: Fri, 29 May 2020 12:10:04 -0400
Subject: [PATCH] Initial source drop of ocltst

This only adds source files for ocltst and the following test modules: oclruntime, oclperf, oclgl, and ocldx. There are no build files for now.

Change-Id: I0f8d9d074c45d82e92f7d30bf22753102f272f4f


[ROCm/clr commit: 75e6add24d0a46e7fe5325e23ad9d5721365f036]
---
 projects/clr/opencl/tests/ocltst/env/Module.h |   54 +
 .../opencl/tests/ocltst/env/ResultStruct.h    |   71 +
 .../clr/opencl/tests/ocltst/env/Timer.cpp     |  111 ++
 projects/clr/opencl/tests/ocltst/env/Timer.h  |   46 +
 projects/clr/opencl/tests/ocltst/env/Worker.h |  180 ++
 .../opencl/tests/ocltst/env/oclsysinfo.cpp    |  162 ++
 .../clr/opencl/tests/ocltst/env/oclsysinfo.h  |   28 +
 .../clr/opencl/tests/ocltst/env/ocltst.cpp    | 1611 +++++++++++++++++
 projects/clr/opencl/tests/ocltst/env/pfm.cpp  |   79 +
 projects/clr/opencl/tests/ocltst/env/pfm.h    |   28 +
 .../opencl/tests/ocltst/include/OCL/Thread.h  |  148 ++
 .../clr/opencl/tests/ocltst/include/OCLLog.h  |   47 +
 .../clr/opencl/tests/ocltst/include/OCLTest.h |   73 +
 .../opencl/tests/ocltst/include/OCLTestList.h |   43 +
 .../tests/ocltst/include/OCLTestUtils.h       |   31 +
 .../opencl/tests/ocltst/include/OCLWrapper.h  |  614 +++++++
 .../opencl/tests/ocltst/log/oclTestLog.cpp    |  104 ++
 .../clr/opencl/tests/ocltst/log/oclTestLog.h  |   44 +
 .../ocltst/module/common/BaseTestImp.cpp      |  185 ++
 .../ocltst/module/common/OCLGLCommon.cpp      |  175 ++
 .../tests/ocltst/module/common/OCLGLCommon.h  |   80 +
 .../ocltst/module/common/OCLGLCommonLinux.cpp |  239 +++
 .../module/common/OCLGLCommonWindows.cpp      |  239 +++
 .../tests/ocltst/module/common/OCLTestImp.cpp |  288 +++
 .../ocltst/module/common/OCLTestListImp.cpp   |   70 +
 .../ocltst/module/common/OCLTestUtils.cpp     |   46 +
 .../tests/ocltst/module/common/OCLThread.cpp  |  209 +++
 .../tests/ocltst/module/common/OCLWrapper.cpp |  944 ++++++++++
 .../tests/ocltst/module/common/Timer.cpp      |  112 ++
 .../opencl/tests/ocltst/module/common/Timer.h |   46 +
 .../tests/ocltst/module/dx/OCLDX11Common.cpp  |  236 +++
 .../tests/ocltst/module/dx/OCLDX11Common.h    |   68 +
 .../tests/ocltst/module/dx/OCLDX11YUY2.cpp    |  478 +++++
 .../tests/ocltst/module/dx/OCLDX11YUY2.h      |   56 +
 .../tests/ocltst/module/dx/TestList.cpp       |   52 +
 .../tests/ocltst/module/dx/ocldx.exclude      |    1 +
 .../tests/ocltst/module/gl/OCLGLBuffer.cpp    |  220 +++
 .../tests/ocltst/module/gl/OCLGLBuffer.h      |   42 +
 .../module/gl/OCLGLBufferMultipleQueues.cpp   |  303 ++++
 .../module/gl/OCLGLBufferMultipleQueues.h     |   48 +
 .../ocltst/module/gl/OCLGLDepthBuffer.cpp     |  270 +++
 .../tests/ocltst/module/gl/OCLGLDepthBuffer.h |   66 +
 .../tests/ocltst/module/gl/OCLGLDepthTex.cpp  |  278 +++
 .../tests/ocltst/module/gl/OCLGLDepthTex.h    |   62 +
 .../tests/ocltst/module/gl/OCLGLFenceSync.cpp |  481 +++++
 .../tests/ocltst/module/gl/OCLGLFenceSync.h   |   55 +
 .../ocltst/module/gl/OCLGLMsaaTexture.cpp     |  298 +++
 .../tests/ocltst/module/gl/OCLGLMsaaTexture.h |   68 +
 .../ocltst/module/gl/OCLGLMultiContext.cpp    |  231 +++
 .../ocltst/module/gl/OCLGLMultiContext.h      |   54 +
 .../tests/ocltst/module/gl/OCLGLTexture.cpp   |  144 ++
 .../tests/ocltst/module/gl/OCLGLTexture.h     |  214 +++
 .../tests/ocltst/module/gl/TestList.cpp       |   54 +
 .../tests/ocltst/module/gl/oclgl.exclude      |    1 +
 .../tests/ocltst/module/include/BaseTestImp.h |  206 +++
 .../tests/ocltst/module/include/OCLTestImp.h  |   83 +
 .../ocltst/module/include/OCLTestListImp.h    |   86 +
 .../tests/ocltst/module/include/OclIncludes.h |   32 +
 .../module/perf/OCLPerf3DImageWriteSpeed.cpp  |  211 +++
 .../module/perf/OCLPerf3DImageWriteSpeed.h    |   49 +
 .../ocltst/module/perf/OCLPerfAES256.cpp      |  451 +++++
 .../tests/ocltst/module/perf/OCLPerfAES256.h  |   58 +
 .../ocltst/module/perf/OCLPerfAtomicSpeed.cpp |  817 +++++++++
 .../ocltst/module/perf/OCLPerfAtomicSpeed.h   |  119 ++
 .../module/perf/OCLPerfAtomicSpeed20.cpp      |  509 ++++++
 .../ocltst/module/perf/OCLPerfAtomicSpeed20.h |  102 ++
 .../module/perf/OCLPerfAtomicSpeed20Kernels.h |   73 +
 .../module/perf/OCLPerfAtomicSpeedKernels.h   |  402 ++++
 .../module/perf/OCLPerfBufferCopyOverhead.cpp |  254 +++
 .../module/perf/OCLPerfBufferCopyOverhead.h   |   50 +
 .../module/perf/OCLPerfBufferCopySpeed.cpp    |  439 +++++
 .../module/perf/OCLPerfBufferCopySpeed.h      |   65 +
 .../module/perf/OCLPerfBufferReadSpeed.cpp    |  334 ++++
 .../module/perf/OCLPerfBufferReadSpeed.h      |   65 +
 .../module/perf/OCLPerfBufferWriteSpeed.cpp   |  333 ++++
 .../module/perf/OCLPerfBufferWriteSpeed.h     |   65 +
 .../ocltst/module/perf/OCLPerfCPUMemSpeed.cpp |  304 ++++
 .../ocltst/module/perf/OCLPerfCPUMemSpeed.h   |   59 +
 .../module/perf/OCLPerfCommandQueue.cpp       |  146 ++
 .../ocltst/module/perf/OCLPerfCommandQueue.h  |   42 +
 .../ocltst/module/perf/OCLPerfConcurrency.cpp |  563 ++++++
 .../ocltst/module/perf/OCLPerfConcurrency.h   |   63 +
 .../module/perf/OCLPerfDevMemReadSpeed.cpp    |  243 +++
 .../module/perf/OCLPerfDevMemReadSpeed.h      |   47 +
 .../module/perf/OCLPerfDevMemWriteSpeed.cpp   |  212 +++
 .../module/perf/OCLPerfDevMemWriteSpeed.h     |   46 +
 .../module/perf/OCLPerfDeviceConcurrency.cpp  |  480 +++++
 .../module/perf/OCLPerfDeviceConcurrency.h    |   60 +
 .../module/perf/OCLPerfDeviceEnqueue.cpp      |  227 +++
 .../ocltst/module/perf/OCLPerfDeviceEnqueue.h |   47 +
 .../module/perf/OCLPerfDeviceEnqueue2.cpp     |  260 +++
 .../module/perf/OCLPerfDeviceEnqueue2.h       |   54 +
 .../module/perf/OCLPerfDeviceEnqueueEvent.cpp |  267 +++
 .../module/perf/OCLPerfDeviceEnqueueEvent.h   |   54 +
 .../module/perf/OCLPerfDeviceEnqueueSier.cpp  |  233 +++
 .../module/perf/OCLPerfDeviceEnqueueSier.h    |   49 +
 .../module/perf/OCLPerfDispatchSpeed.cpp      |  391 ++++
 .../ocltst/module/perf/OCLPerfDispatchSpeed.h |   58 +
 .../ocltst/module/perf/OCLPerfDoubleDMA.cpp   |  442 +++++
 .../ocltst/module/perf/OCLPerfDoubleDMA.h     |   42 +
 .../module/perf/OCLPerfDoubleDMASeq.cpp       |  291 +++
 .../ocltst/module/perf/OCLPerfDoubleDMASeq.h  |   43 +
 .../ocltst/module/perf/OCLPerfFillBuffer.cpp  |  114 ++
 .../ocltst/module/perf/OCLPerfFillBuffer.h    |   48 +
 .../ocltst/module/perf/OCLPerfFillImage.cpp   |  109 ++
 .../ocltst/module/perf/OCLPerfFillImage.h     |   45 +
 .../tests/ocltst/module/perf/OCLPerfFlush.cpp |  165 ++
 .../tests/ocltst/module/perf/OCLPerfFlush.h   |   42 +
 .../module/perf/OCLPerfGenericBandwidth.cpp   |  309 ++++
 .../module/perf/OCLPerfGenericBandwidth.h     |   57 +
 .../module/perf/OCLPerfGenoilSiaMiner.cpp     |  429 +++++
 .../module/perf/OCLPerfGenoilSiaMiner.h       |   78 +
 .../module/perf/OCLPerfImageCopyCorners.cpp   |  367 ++++
 .../module/perf/OCLPerfImageCopyCorners.h     |   55 +
 .../module/perf/OCLPerfImageCopySpeed.cpp     |  344 ++++
 .../module/perf/OCLPerfImageCopySpeed.h       |   56 +
 .../ocltst/module/perf/OCLPerfImageCreate.cpp |  194 ++
 .../ocltst/module/perf/OCLPerfImageCreate.h   |   51 +
 .../module/perf/OCLPerfImageMapUnmap.cpp      |  333 ++++
 .../ocltst/module/perf/OCLPerfImageMapUnmap.h |   57 +
 .../module/perf/OCLPerfImageReadSpeed.cpp     |  295 +++
 .../module/perf/OCLPerfImageReadSpeed.h       |   61 +
 .../module/perf/OCLPerfImageReadWrite.cpp     |  223 +++
 .../module/perf/OCLPerfImageReadWrite.h       |   51 +
 .../module/perf/OCLPerfImageReadsRGBA.cpp     |  236 +++
 .../module/perf/OCLPerfImageReadsRGBA.h       |   52 +
 .../module/perf/OCLPerfImageSampleRate.cpp    |  324 ++++
 .../module/perf/OCLPerfImageSampleRate.h      |   58 +
 .../module/perf/OCLPerfImageWriteSpeed.cpp    |  317 ++++
 .../module/perf/OCLPerfImageWriteSpeed.h      |   62 +
 .../module/perf/OCLPerfKernelArguments.cpp    |  239 +++
 .../module/perf/OCLPerfKernelArguments.h      |   43 +
 .../module/perf/OCLPerfKernelThroughput.cpp   | 1008 +++++++++++
 .../module/perf/OCLPerfKernelThroughput.h     |  118 ++
 .../ocltst/module/perf/OCLPerfLDSLatency.cpp  |  432 +++++
 .../ocltst/module/perf/OCLPerfLDSLatency.h    |   59 +
 .../module/perf/OCLPerfLDSReadSpeed.cpp       |  395 ++++
 .../ocltst/module/perf/OCLPerfLDSReadSpeed.h  |   59 +
 .../ocltst/module/perf/OCLPerfMandelbrot.cpp  |  940 ++++++++++
 .../ocltst/module/perf/OCLPerfMandelbrot.h    |   75 +
 .../module/perf/OCLPerfMapBufferReadSpeed.cpp |  262 +++
 .../module/perf/OCLPerfMapBufferReadSpeed.h   |   56 +
 .../perf/OCLPerfMapBufferWriteSpeed.cpp       |  291 +++
 .../module/perf/OCLPerfMapBufferWriteSpeed.h  |   58 +
 .../module/perf/OCLPerfMapImageReadSpeed.cpp  |  213 +++
 .../module/perf/OCLPerfMapImageReadSpeed.h    |   49 +
 .../module/perf/OCLPerfMapImageWriteSpeed.cpp |  214 +++
 .../module/perf/OCLPerfMapImageWriteSpeed.h   |   49 +
 .../module/perf/OCLPerfMatrixTranspose.cpp    |  326 ++++
 .../module/perf/OCLPerfMatrixTranspose.h      |   57 +
 .../ocltst/module/perf/OCLPerfMemCombine.cpp  |  234 +++
 .../ocltst/module/perf/OCLPerfMemCombine.h    |   56 +
 .../ocltst/module/perf/OCLPerfMemCreate.cpp   |  176 ++
 .../ocltst/module/perf/OCLPerfMemCreate.h     |   43 +
 .../ocltst/module/perf/OCLPerfMemLatency.cpp  |  418 +++++
 .../ocltst/module/perf/OCLPerfMemLatency.h    |   61 +
 .../perf/OCLPerfPinnedBufferReadSpeed.cpp     |  347 ++++
 .../perf/OCLPerfPinnedBufferReadSpeed.h       |   66 +
 .../perf/OCLPerfPinnedBufferWriteSpeed.cpp    |  342 ++++
 .../perf/OCLPerfPinnedBufferWriteSpeed.h      |   66 +
 .../module/perf/OCLPerfPipeCopySpeed.cpp      |  504 ++++++
 .../ocltst/module/perf/OCLPerfPipeCopySpeed.h |   60 +
 .../module/perf/OCLPerfProgramGlobalRead.cpp  |  549 ++++++
 .../module/perf/OCLPerfProgramGlobalRead.h    |   60 +
 .../module/perf/OCLPerfProgramGlobalWrite.cpp |  384 ++++
 .../module/perf/OCLPerfProgramGlobalWrite.h   |   58 +
 .../ocltst/module/perf/OCLPerfSHA256.cpp      |  841 +++++++++
 .../tests/ocltst/module/perf/OCLPerfSHA256.h  |   58 +
 .../ocltst/module/perf/OCLPerfSVMAlloc.cpp    |  263 +++
 .../ocltst/module/perf/OCLPerfSVMAlloc.h      |   46 +
 .../module/perf/OCLPerfSVMKernelArguments.cpp |  255 +++
 .../module/perf/OCLPerfSVMKernelArguments.h   |   47 +
 .../ocltst/module/perf/OCLPerfSVMMap.cpp      |  153 ++
 .../tests/ocltst/module/perf/OCLPerfSVMMap.h  |   44 +
 .../ocltst/module/perf/OCLPerfSVMMemFill.cpp  |  214 +++
 .../ocltst/module/perf/OCLPerfSVMMemFill.h    |   50 +
 .../ocltst/module/perf/OCLPerfSVMMemcpy.cpp   |  216 +++
 .../ocltst/module/perf/OCLPerfSVMMemcpy.h     |   47 +
 .../module/perf/OCLPerfSVMSampleRate.cpp      |  359 ++++
 .../ocltst/module/perf/OCLPerfSVMSampleRate.h |   63 +
 .../ocltst/module/perf/OCLPerfSampleRate.cpp  |  336 ++++
 .../ocltst/module/perf/OCLPerfSampleRate.h    |   60 +
 .../perf/OCLPerfScalarReplArrayElem.cpp       |  325 ++++
 .../module/perf/OCLPerfScalarReplArrayElem.h  |   60 +
 .../ocltst/module/perf/OCLPerfSdiP2PCopy.cpp  |  261 +++
 .../ocltst/module/perf/OCLPerfSdiP2PCopy.h    |   52 +
 .../tests/ocltst/module/perf/OCLPerfSepia.cpp |  586 ++++++
 .../tests/ocltst/module/perf/OCLPerfSepia.h   |   58 +
 .../module/perf/OCLPerfTextureMemLatency.cpp  |  409 +++++
 .../module/perf/OCLPerfTextureMemLatency.h    |   60 +
 .../module/perf/OCLPerfUAVReadSpeed.cpp       |  630 +++++++
 .../ocltst/module/perf/OCLPerfUAVReadSpeed.h  |   63 +
 .../perf/OCLPerfUAVReadSpeedHostMem.cpp       |  437 +++++
 .../module/perf/OCLPerfUAVReadSpeedHostMem.h  |   63 +
 .../perf/OCLPerfUAVWriteSpeedHostMem.cpp      |  380 ++++
 .../module/perf/OCLPerfUAVWriteSpeedHostMem.h |   58 +
 .../module/perf/OCLPerfUncoalescedRead.cpp    |  270 +++
 .../module/perf/OCLPerfUncoalescedRead.h      |   44 +
 .../module/perf/OCLPerfVerticalFetch.cpp      |  353 ++++
 .../ocltst/module/perf/OCLPerfVerticalFetch.h |   49 +
 .../tests/ocltst/module/perf/TestList.cpp     |  191 ++
 .../tests/ocltst/module/perf/oclperf.exclude  |   28 +
 .../ocltst/module/runtime/OCLAsyncMap.cpp     |   98 +
 .../tests/ocltst/module/runtime/OCLAsyncMap.h |   38 +
 .../module/runtime/OCLAsyncTransfer.cpp       |  139 ++
 .../ocltst/module/runtime/OCLAsyncTransfer.h  |   38 +
 .../module/runtime/OCLAtomicCounter.cpp       |  168 ++
 .../ocltst/module/runtime/OCLAtomicCounter.h  |   41 +
 .../ocltst/module/runtime/OCLBlitKernel.cpp   |  612 +++++++
 .../ocltst/module/runtime/OCLBlitKernel.h     |   41 +
 .../module/runtime/OCLBufferFromImage.cpp     |  289 +++
 .../module/runtime/OCLBufferFromImage.h       |   57 +
 .../module/runtime/OCLCPUGuardPages.cpp       |  178 ++
 .../ocltst/module/runtime/OCLCPUGuardPages.h  |   49 +
 .../ocltst/module/runtime/OCLCreateBuffer.cpp |  173 ++
 .../ocltst/module/runtime/OCLCreateBuffer.h   |   47 +
 .../module/runtime/OCLCreateContext.cpp       |   98 +
 .../ocltst/module/runtime/OCLCreateContext.h  |   38 +
 .../ocltst/module/runtime/OCLCreateImage.cpp  |  493 +++++
 .../ocltst/module/runtime/OCLCreateImage.h    |   48 +
 .../ocltst/module/runtime/OCLDeviceAtomic.cpp |  210 +++
 .../ocltst/module/runtime/OCLDeviceAtomic.h   |   44 +
 .../module/runtime/OCLDeviceQueries.cpp       |  288 +++
 .../ocltst/module/runtime/OCLDeviceQueries.h  |   41 +
 .../ocltst/module/runtime/OCLDynamic.cpp      |  225 +++
 .../tests/ocltst/module/runtime/OCLDynamic.h  |   43 +
 .../module/runtime/OCLDynamicBLines.cpp       |  357 ++++
 .../ocltst/module/runtime/OCLDynamicBLines.h  |   54 +
 .../module/runtime/OCLGenericAddressSpace.cpp |  815 +++++++++
 .../module/runtime/OCLGenericAddressSpace.h   |   50 +
 .../module/runtime/OCLGetQueueThreadID.cpp    |  116 ++
 .../module/runtime/OCLGetQueueThreadID.h      |   41 +
 .../ocltst/module/runtime/OCLGlobalOffset.cpp |  126 ++
 .../ocltst/module/runtime/OCLGlobalOffset.h   |   38 +
 .../module/runtime/OCLImage2DFromBuffer.cpp   |  389 ++++
 .../module/runtime/OCLImage2DFromBuffer.h     |   56 +
 .../module/runtime/OCLImageCopyPartial.cpp    |  347 ++++
 .../module/runtime/OCLImageCopyPartial.h      |   57 +
 .../ocltst/module/runtime/OCLKernelBinary.cpp |  252 +++
 .../ocltst/module/runtime/OCLKernelBinary.h   |   38 +
 .../tests/ocltst/module/runtime/OCLLDS32K.cpp |  371 ++++
 .../tests/ocltst/module/runtime/OCLLDS32K.h   |   51 +
 .../ocltst/module/runtime/OCLLinearFilter.cpp |  187 ++
 .../ocltst/module/runtime/OCLLinearFilter.h   |   38 +
 .../ocltst/module/runtime/OCLLiquidFlash.cpp  |  264 +++
 .../ocltst/module/runtime/OCLLiquidFlash.h    |   57 +
 .../ocltst/module/runtime/OCLMapCount.cpp     |   98 +
 .../tests/ocltst/module/runtime/OCLMapCount.h |   60 +
 .../module/runtime/OCLMemDependency.cpp       |  153 ++
 .../ocltst/module/runtime/OCLMemDependency.h  |   38 +
 .../ocltst/module/runtime/OCLMemObjs.cpp      |  139 ++
 .../tests/ocltst/module/runtime/OCLMemObjs.h  |   45 +
 .../ocltst/module/runtime/OCLMemoryInfo.cpp   |  200 ++
 .../ocltst/module/runtime/OCLMemoryInfo.h     |   42 +
 .../ocltst/module/runtime/OCLMultiQueue.cpp   |  295 +++
 .../ocltst/module/runtime/OCLMultiQueue.h     |   43 +
 .../module/runtime/OCLOfflineCompilation.cpp  |  206 +++
 .../module/runtime/OCLOfflineCompilation.h    |   38 +
 .../ocltst/module/runtime/OCLP2PBuffer.cpp    |  286 +++
 .../ocltst/module/runtime/OCLP2PBuffer.h      |   56 +
 .../module/runtime/OCLPartialWrkgrp.cpp       |  292 +++
 .../ocltst/module/runtime/OCLPartialWrkgrp.h  |   41 +
 .../ocltst/module/runtime/OCLPerfCounters.cpp |  798 ++++++++
 .../ocltst/module/runtime/OCLPerfCounters.h   |   50 +
 .../ocltst/module/runtime/OCLPersistent.cpp   |  139 ++
 .../ocltst/module/runtime/OCLPersistent.h     |   50 +
 .../ocltst/module/runtime/OCLPinnedMemory.cpp |  218 +++
 .../ocltst/module/runtime/OCLPinnedMemory.h   |   51 +
 .../module/runtime/OCLPlatformAtomics.cpp     |  182 ++
 .../module/runtime/OCLPlatformAtomics.h       |   41 +
 .../runtime/OCLProgramScopeVariables.cpp      |  274 +++
 .../module/runtime/OCLProgramScopeVariables.h |   46 +
 .../ocltst/module/runtime/OCLRTQueue.cpp      |  415 +++++
 .../tests/ocltst/module/runtime/OCLRTQueue.h  |   48 +
 .../module/runtime/OCLReadWriteImage.cpp      |  372 ++++
 .../ocltst/module/runtime/OCLReadWriteImage.h |   50 +
 .../tests/ocltst/module/runtime/OCLSDI.cpp    |  515 ++++++
 .../tests/ocltst/module/runtime/OCLSDI.h      |   65 +
 .../tests/ocltst/module/runtime/OCLSVM.cpp    |  612 +++++++
 .../tests/ocltst/module/runtime/OCLSVM.h      |   64 +
 .../ocltst/module/runtime/OCLSemaphore.cpp    |  225 +++
 .../ocltst/module/runtime/OCLSemaphore.h      |   39 +
 .../ocltst/module/runtime/OCLStablePState.cpp |  129 ++
 .../ocltst/module/runtime/OCLStablePState.h   |   41 +
 .../ocltst/module/runtime/OCLThreadTrace.cpp  |  344 ++++
 .../ocltst/module/runtime/OCLThreadTrace.h    |   71 +
 .../module/runtime/OCLUnalignedCopy.cpp       |  127 ++
 .../ocltst/module/runtime/OCLUnalignedCopy.h  |   41 +
 .../tests/ocltst/module/runtime/TestList.cpp  |  129 ++
 .../ocltst/module/runtime/oclruntime.exclude  |    7 +
 290 files changed, 54116 insertions(+)
 create mode 100644 projects/clr/opencl/tests/ocltst/env/Module.h
 create mode 100644 projects/clr/opencl/tests/ocltst/env/ResultStruct.h
 create mode 100644 projects/clr/opencl/tests/ocltst/env/Timer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/env/Timer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/env/Worker.h
 create mode 100644 projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/env/oclsysinfo.h
 create mode 100644 projects/clr/opencl/tests/ocltst/env/ocltst.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/env/pfm.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/env/pfm.h
 create mode 100644 projects/clr/opencl/tests/ocltst/include/OCL/Thread.h
 create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLLog.h
 create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLTest.h
 create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLTestList.h
 create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLTestUtils.h
 create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLWrapper.h
 create mode 100644 projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/log/oclTestLog.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/Timer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/common/Timer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude
 create mode 100644 projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp
 create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude

diff --git a/projects/clr/opencl/tests/ocltst/env/Module.h b/projects/clr/opencl/tests/ocltst/env/Module.h
new file mode 100644
index 0000000000..25e3017fa6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/Module.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCL_TEST_MODULE_H
+#define OCL_TEST_MODULE_H
+
+#include <string>
+
+#include "OCLTest.h"
+#include "OCLTestList.h"
+
+// Record kept per test module: a name, a ModuleHandle, six *FuncPtr members
+// and an OCLTest** cache; everything except the name starts out as zero.
+struct Module {
+  std::string name;
+  ModuleHandle hmodule;
+  TestCountFuncPtr get_count;
+  TestNameFuncPtr get_name;
+  CreateTestFuncPtr create_test;
+  DestroyTestFuncPtr destroy_test;
+  TestVersionFuncPtr get_version;
+  TestLibNameFuncPtr get_libname;
+  OCLTest** cached_test;
+
+  Module()
+      : name(),
+        hmodule(0),
+        get_count(0),
+        get_name(0),
+        create_test(0),
+        destroy_test(0),
+        get_version(0),
+        get_libname(0),
+        cached_test(0) {}
+};
+
+#endif  // OCL_TEST_MODULE_H
diff --git a/projects/clr/opencl/tests/ocltst/env/ResultStruct.h b/projects/clr/opencl/tests/ocltst/env/ResultStruct.h
new file mode 100644
index 0000000000..198a6e67ff
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/ResultStruct.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _RESULT_STRUCT_H_
+#define _RESULT_STRUCT_H_  // turns the #ifndef above into a working guard
+struct IndicesRange {  // pair of test indices: start and end of a range
+  int startIndex;
+  int endIndex;
+};
+
+#define INDEX_ALL_TESTS (-1)
+#define EXTREMELY_SMALL_VALUE (-10000.0f)  // starting value of Report::max
+#define EXTREMELY_LARGE_VALUE (10000.0f)   // starting value of Report::min
+
+class TestResult {  // One test outcome: a float value, a text and a flag.
+ public:
+  float value;
+  std::string resultString;
+  bool passed;
+
+  TestResult(float val) : value(val), resultString("\n"), passed(true) {}
+  // reset() restores the state a freshly constructed TestResult(val) has.
+  void reset(float val) {
+    resultString = "\n";
+    passed = true;
+    value = val;
+  }
+};
+
+class Report {  // Run summary: extreme results, overall flag, failure count.
+ public:
+  TestResult *max;
+  TestResult *min;
+  bool success;
+  int numFailedTests;
+  // NOTE(review): copying a Report would delete max/min twice (no copy ctor).
+  Report()
+      : max(new TestResult(EXTREMELY_SMALL_VALUE)),
+        min(new TestResult(EXTREMELY_LARGE_VALUE)),
+        success(true), numFailedTests(0) {}
+
+  void reset() {  // back to the state right after construction
+    numFailedTests = 0;
+    success = true;
+    min->reset(EXTREMELY_LARGE_VALUE);
+    max->reset(EXTREMELY_SMALL_VALUE);
+  }
+  ~Report() {
+    delete min;
+    delete max;
+  }
+};
+
+#endif  // _RESULT_STRUCT_H_
diff --git a/projects/clr/opencl/tests/ocltst/env/Timer.cpp b/projects/clr/opencl/tests/ocltst/env/Timer.cpp
new file mode 100644
index 0000000000..0b8baad859
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/Timer.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "Timer.h"
+
+#ifdef ATI_OS_WIN
+#include <windows.h>
+#endif
+
+#ifdef ATI_OS_LINUX
+#include <sys/time.h>
+#endif
+
+CPerfCounter::CPerfCounter() : _clocks(0), _start(0) {
+  // _freq is the number of counter ticks per second on the build platform.
+#if defined(ATI_OS_WIN)
+  // Windows: ask the OS for the frequency of its performance counter.
+  QueryPerformanceFrequency((LARGE_INTEGER *)&_freq);
+#endif
+#if defined(ATI_OS_LINUX)
+  // Linux: Start()/Stop() keep time in milliseconds, hence 1000 per second.
+  _freq = 1000;
+#endif
+}
+
+CPerfCounter::~CPerfCounter() {
+  // Nothing to release: the counter consists of three integer members only.
+}
+
+void CPerfCounter::Start(void) {
+  // Records the current counter reading in _start; Stop() consumes it.
+#if defined(ATI_OS_WIN)
+  if (_start != 0) {  // a previous Start() has not been stopped yet
+    MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK);
+    exit(0);
+  }
+  QueryPerformanceCounter((LARGE_INTEGER *)&_start);
+#endif
+#if defined(ATI_OS_LINUX)
+  // Wall-clock time in whole milliseconds; the sub-millisecond part is cut.
+  struct timeval now;
+  gettimeofday(&now, 0);
+  const i64 wholeSecondsMs = (i64)now.tv_sec * 1000;
+  const i64 fractionMs = (i64)now.tv_usec / 1000;
+  _start = wholeSecondsMs + fractionMs;
+#endif
+}
+
+void CPerfCounter::Stop(void) {
+  // Adds the ticks elapsed since Start() to _clocks and clears _start.
+  i64 now;
+
+#if defined(ATI_OS_WIN)
+  if (_start == 0) {  // _start is zero, i.e. no Start() is pending
+    MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK);
+    exit(0);
+  }
+
+  QueryPerformanceCounter((LARGE_INTEGER *)&now);
+#endif
+#if defined(ATI_OS_LINUX)
+  // Same whole-millisecond wall-clock reading that Start() takes.
+  struct timeval tv;
+  gettimeofday(&tv, 0);
+  const i64 wholeSecondsMs = (i64)tv.tv_sec * 1000;
+  const i64 fractionMs = (i64)tv.tv_usec / 1000;
+  now = wholeSecondsMs + fractionMs;
+#endif
+
+  const i64 elapsed = now - _start;
+  _start = 0;  // zero is what the Windows checks treat as "not running"
+  _clocks += elapsed;
+}
+
+void CPerfCounter::Reset(void) {  // discards the accumulated tick total
+#if defined(ATI_OS_WIN)
+  if (_start != 0) {  // refuse to reset while a Start() is still pending
+    MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK);
+    exit(0);
+  }
+#endif
+  _clocks = 0;
+}
+
+double CPerfCounter::GetElapsedTime(void) {
+#if defined(ATI_OS_WIN)
+  if (_start != 0) {  // a Start() has not been closed by Stop() yet
+    MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK);
+    exit(0);
+  }
+#endif
+  // Accumulated ticks divided by ticks-per-second gives elapsed seconds.
+  return static_cast<double>(_clocks) / static_cast<double>(_freq);
+}
diff --git a/projects/clr/opencl/tests/ocltst/env/Timer.h b/projects/clr/opencl/tests/ocltst/env/Timer.h
new file mode 100644
index 0000000000..058e00c44f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/Timer.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+#if defined(ATI_OS_WIN)
+typedef __int64 i64;  // integer type of every CPerfCounter member
+#endif
+#if defined(ATI_OS_LINUX)
+typedef long long i64;  // integer type of every CPerfCounter member
+#endif
+
+class CPerfCounter {  // Start/Stop stopwatch that accumulates elapsed ticks.
+ public:
+  CPerfCounter();
+  ~CPerfCounter();
+  void Start();             // take the starting reading of an interval
+  void Stop();              // add the interval's length to the total
+  void Reset();             // zero the accumulated total
+  double GetElapsedTime();  // accumulated total divided by _freq
+
+ private:
+  i64 _freq;    // ticks per second
+  i64 _clocks;  // ticks accumulated over all Start/Stop intervals
+  i64 _start;   // reading taken by Start(); set back to 0 by Stop()
+};
+
+#endif  // _TIMER_H_
diff --git a/projects/clr/opencl/tests/ocltst/env/Worker.h b/projects/clr/opencl/tests/ocltst/env/Worker.h
new file mode 100644
index 0000000000..b9e29d370b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/Worker.h
@@ -0,0 +1,180 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCL_TEST_WORKER_H
+#define OCL_TEST_WORKER_H
+
+/////////////////////////////////////////////////////////////////////////////
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "Module.h"
+#include "OCLTest.h"
+#include "OCLTestList.h"
+#include "ResultStruct.h"
+#include "Timer.h"
+#include "getopt.h"
+#include "pfm.h"
+
+/////////////////////////////////////////////////////////////////////////////
+
+typedef void* (*TestMethod)(void* param);  // void* in, void* out callback
+
+/////////////////////////////////////////////////////////////////////////////
+
+class Worker {  // Settings for one test run plus the buffer/result it owns.
+ public:
+  // Idle worker: every field is zero/false and nothing is owned yet.
+  Worker()
+      : m_wrapper(0), m_module(0), m_run(0), m_id(0), m_subtest(0),
+        m_testindex(0), m_dump(false), m_display(false), m_useCPU(false),
+        m_window(0), m_width(0), m_height(0), m_buffer(0), m_perflab(false),
+        m_deviceId(0), m_platform(0),
+        m_result(0) {}  // must be null: ~Worker() deletes it unconditionally
+
+  // Worker for one test run. It always owns a fresh TestResult and, when
+  // dump or view is set, a zero-filled buffer of 4 floats per (x, y) element.
+  Worker(OCLWrapper* wrapper, Module* module, TestMethod run, unsigned int id,
+         unsigned int subtest, unsigned int testindex, bool dump, bool view,
+         bool useCPU, void* window, unsigned int x, unsigned int y,
+         bool perflab, unsigned int deviceId = 0, unsigned int platform = 0)
+      : m_wrapper(wrapper),
+        m_module(module),
+        m_run(run),
+        m_id(id),
+        m_subtest(subtest),
+        m_testindex(testindex),
+        m_dump(dump),
+        m_display(view),
+        m_useCPU(useCPU),
+        m_window(window),
+        m_width(x),
+        m_height(y),
+        m_buffer(0),
+        m_perflab(perflab),
+        m_deviceId(deviceId),
+        m_platform(platform),
+        m_result(0) {
+    if (m_dump || m_display) {
+      m_buffer = new float[bufferLength()];
+      memset(m_buffer, 0, bufferLength() * sizeof(float));
+    }
+    m_result = new TestResult(0.0f);
+  }
+
+  // Copies are deep: the new worker gets its own buffer and its own
+  // TestResult, so no two destructors ever free the same memory.
+  Worker(const Worker& w) : m_buffer(0), m_result(0) { copyFrom(w); }
+
+  // Assignment is deep as well and releases what *this owned before.
+  Worker& operator=(const Worker& w) {
+    if (this != &w) copyFrom(w);
+    return *this;
+  }
+
+  ~Worker() {
+    delete[] m_buffer;  // deleting a null pointer is a no-op
+    m_buffer = 0;
+    delete m_result;
+    m_result = 0;
+  }
+
+  OCLWrapper* getOCLWrapper() { return m_wrapper; }
+  Module* getModule() { return m_module; }
+  TestMethod getTestMethod() { return m_run; }
+  unsigned int getId() { return m_id; }
+  unsigned int getSubTest() { return m_subtest; }
+  unsigned int getTestIndex() { return m_testindex; }
+  bool isDumpEnabled() { return m_dump; }
+  bool isDisplayEnabled() { return m_display; }
+  bool isCPUEnabled() { return m_useCPU; }
+  void* getWindow() { return m_window; }
+  unsigned int getWidth() { return m_width; }
+  unsigned int getHeight() { return m_height; }
+  float* getBuffer() { return m_buffer; }
+  bool getPerflab() { return m_perflab; }
+  unsigned int getDeviceId() { return m_deviceId; }
+  TestResult* getResult() { return m_result; }
+  unsigned int getPlatformID() { return m_platform; }
+
+ private:
+  // Floats in the frame buffer, computed in size_t instead of unsigned int.
+  size_t bufferLength() const {
+    return static_cast<size_t>(4) * m_width * m_height;
+  }
+
+  // Makes *this a deep copy of w. The replacements are allocated first and
+  // the old buffer/result are freed only afterwards.
+  void copyFrom(const Worker& w) {
+    float* buffer = 0;
+    if (w.m_buffer) {
+      buffer = new float[w.bufferLength()];
+      memcpy(buffer, w.m_buffer, w.bufferLength() * sizeof(float));
+    }
+    TestResult* result = w.m_result ? new TestResult(*w.m_result) : 0;
+    delete[] m_buffer;
+    delete m_result;
+    m_buffer = buffer;
+    m_result = result;
+    m_wrapper = w.m_wrapper;
+    m_module = w.m_module;
+    m_run = w.m_run;
+    m_id = w.m_id;
+    m_subtest = w.m_subtest;
+    m_testindex = w.m_testindex;
+    m_dump = w.m_dump;
+    m_display = w.m_display;
+    m_useCPU = w.m_useCPU;
+    m_window = w.m_window;
+    m_width = w.m_width;
+    m_height = w.m_height;
+    m_perflab = w.m_perflab;
+    m_deviceId = w.m_deviceId;
+    m_platform = w.m_platform;
+  }
+
+  OCLWrapper* m_wrapper;
+  Module* m_module;
+  TestMethod m_run;
+  unsigned int m_id;
+  unsigned int m_subtest;
+  unsigned int m_testindex;
+  bool m_dump;
+  bool m_display;
+  bool m_useCPU;
+  void* m_window;
+  unsigned int m_width;
+  unsigned int m_height;
+  float* m_buffer;
+  bool m_perflab;
+  unsigned int m_deviceId;
+  unsigned int m_platform;
+  TestResult* m_result;
+};
+
+/////////////////////////////////////////////////////////////////////////////
+
+#endif  // OCL_TEST_WORKER_H
diff --git a/projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp
new file mode 100644
index 0000000000..02e2a0402d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp
@@ -0,0 +1,162 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "oclsysinfo.h"
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#include <cstdio>
+
+#ifndef MAX_DEVICES
+#define MAX_DEVICES 16  // fallback value; not referenced again in this file
+#endif  // MAX_DEVICES
+
+// Appends "<label><value>\n" to out, where <value> is the string that
+// clGetDeviceInfo() returns for `param`. A failed query contributes an empty
+// value rather than stale or uninitialized buffer contents.
+static void appendDeviceString(std::string &out, const char *label,
+                               cl_device_id device, cl_device_info param) {
+  char value[1024];
+  value[0] = '\0';
+  if (clGetDeviceInfo(device, param, sizeof(value), value, NULL) !=
+      CL_SUCCESS) {
+    value[0] = '\0';
+  }
+  out.append(label).append(value).append("\n");
+}
+
+// Appends a human-readable description of one OpenCL device to info_string.
+//
+//   info_string    receives the text; its existing content is kept
+//   use_cpu        true selects CL_DEVICE_TYPE_CPU, false CL_DEVICE_TYPE_GPU
+//   dev_id         index of the device among the platform's devices of
+//                  that type
+//   platformIndex  index into the list returned by clGetPlatformIDs()
+//
+// Returns 1 on success. Returns 0, after printing a message to stderr, when
+// a platform/device query fails or an index is out of range; in that case
+// nothing has been appended to info_string.
+int oclSysInfo(std::string &info_string, bool use_cpu, unsigned dev_id,
+               unsigned int platformIndex) {
+  const cl_device_type type =
+      use_cpu ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU;
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+
+  cl_int error = clGetPlatformIDs(0, NULL, &numPlatforms);
+  if (CL_SUCCESS != error) {
+    fprintf(stderr, "clGetPlatformIDs() failed");
+    return 0;
+  }
+  if (0 < numPlatforms) {
+    // Reject the index before it is used to subscript the platform array.
+    if (platformIndex >= numPlatforms) {
+      fprintf(stderr, "Platform selected does not exist.\n");
+      return 0;
+    }
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error = clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (CL_SUCCESS == error) {
+      platform = platforms[platformIndex];
+    }
+    // Only the selected id is needed from here on, so the array is released
+    // ahead of every early return below.
+    delete[] platforms;
+    if (CL_SUCCESS != error) {
+      fprintf(stderr, "clGetPlatformIDs() failed");
+      return 0;
+    }
+    error = clGetDeviceIDs(platform, type, 0, NULL, &num_devices);
+    if (error) {
+      fprintf(stderr, "clGetDeviceIDs failed: %d\n", error);
+      return 0;
+    }
+  }
+
+  // num_devices is still 0 here when no platform was reported at all.
+  if (dev_id >= num_devices) {
+    fprintf(stderr, "Device selected does not exist.\n");
+    return 0;
+  }
+  if (NULL == platform) {
+    fprintf(stderr,
+            "Couldn't find platform with GPU devices, cannot proceed.\n");
+    return 0;
+  }
+
+  cl_device_id *devices =
+      (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  if (!devices) {
+    fprintf(stderr, "no devices\n");
+    return 0;
+  }
+
+  /* Get the requested device */
+  error = clGetDeviceIDs(platform, type, num_devices, devices, NULL);
+  if (error) {
+    fprintf(stderr, "clGetDeviceIDs failed: %d\n", error);
+    free(devices);
+    return 0;
+  }
+  // A cl_device_id is a handle, so the selected one stays usable after the
+  // array that carried it has been freed.
+  const cl_device_id device = devices[dev_id];
+  free(devices);
+
+  info_string.append("\nCompute Device info:\n");
+
+  // Start from an empty string so that a failed query prints no value.
+  char version[1024];
+  version[0] = '\0';
+  if (clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version),
+                        version, NULL) != CL_SUCCESS) {
+    version[0] = '\0';
+  }
+  info_string.append("\tPlatform Version: ").append(version).append("\n");
+
+  appendDeviceString(info_string, "\tDevice Name: ", device, CL_DEVICE_NAME);
+  appendDeviceString(info_string, "\tVendor: ", device, CL_DEVICE_VENDOR);
+  appendDeviceString(info_string, "\tDevice Version: ", device,
+                     CL_DEVICE_VERSION);
+  appendDeviceString(info_string, "\tDriver Version: ", device,
+                     CL_DRIVER_VERSION);
+  appendDeviceString(info_string, "\tBoard Name: ", device,
+                     CL_DEVICE_BOARD_NAME_AMD);
+
+#if defined(ATI_OS_LINUX)
+  cl_device_topology_amd topology;
+  error = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(topology),
+                          &topology, NULL);
+  // topology holds meaningful data only if the query itself succeeded.
+  if (CL_SUCCESS == error &&
+      topology.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) {
+    char line[256];
+    // Bounded write: the formatted text can never run past the end of line.
+    snprintf(line, sizeof(line),
+             "\tDevice Topology: PCI[ B#%d, D#%d, F#%d]\n", topology.pcie.bus,
+             topology.pcie.device, topology.pcie.function);
+    info_string.append(line);
+  }
+#endif
+
+  return 1;
+}
diff --git a/projects/clr/opencl/tests/ocltst/env/oclsysinfo.h b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.h
new file mode 100644
index 0000000000..4fd1fa2d16
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLSYSINFO_H_
+#define _OCLSYSINFO_H_
+#include <string>
+
+//! Append a textual description of one OpenCL device to info_string.
+//! @param info_string  string the report text is appended to
+//! @param useCPU       presumably selects CPU devices instead of GPU devices
+//!                     (the definition passes a CPU/GPU device type to
+//!                     clGetDeviceIDs) - confirm against oclsysinfo.cpp
+//! @param dev_id       index of the device to describe
+//! @param platformIndex presumably the index of the platform to query;
+//!                     defaults to 0
+//! @return 1 on success, 0 when clGetDeviceIDs fails.
+//!         NOTE(review): other early-exit paths in the definition were not
+//!         checked here - confirm.
+int oclSysInfo(std::string& info_string, bool useCPU, unsigned dev_id,
+               unsigned int platformIndex = 0);
+
+#endif  //_OCLSYSINFO_H_
diff --git a/projects/clr/opencl/tests/ocltst/env/ocltst.cpp b/projects/clr/opencl/tests/ocltst/env/ocltst.cpp
new file mode 100644
index 0000000000..888059fce7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/ocltst.cpp
@@ -0,0 +1,1611 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+/////////////////////////////////////////////////////////////////////////////
+
+#include <CL/cl.h>
+
+#ifdef ATI_OS_WIN
+#include <windows.h>
+
+#include "Window.h"
+typedef HMODULE ModuleHandle;
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef ATI_OS_LINUX
+#include <dlfcn.h>
+typedef void* ModuleHandle;
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+
+#include "BaseTestImp.h"
+#include "Module.h"
+#include "OCLLog.h"
+#include "OCLTest.h"
+#include "OCLTestImp.h"
+#include "OCLTestList.h"
+#include "OCLWrapper.h"
+#include "Timer.h"
+#include "Worker.h"
+#include "getopt.h"
+#include "oclsysinfo.h"
+#include "pfm.h"
+
+//! Including OCLutilities Thread utility
+#include "OCL/Thread.h"
+
+//! Lock that needs to be obtained to access the global
+//! module variable
+static OCLutil::Lock moduleLock;
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef ATI_OS_WIN
+static LONG WINAPI xFilter(LPEXCEPTION_POINTERS xEP);
+void serviceStubCall();
+#endif
+
+#define MAX_DEVICES 16
+#undef CHECK_RESULT
+#define CHECK_RESULT(test, msg) \
+  if ((test)) {                 \
+    printf("\n%s\n", msg);      \
+    exit(1);                    \
+  }
+
+//! Declaration of a function that find devices of a specific type for the
+//! chosen platform
+int findAdapters(unsigned int platformIdx, bool useCPU, cl_platform_id*);
+
+//! class App that is used to run the tests on the system
+//! class App that is used to run the tests on the system.
+//! It owns the parsed command-line options, the list of loaded test modules
+//! and the worker/thread slots used to execute individual subtests.
+class App {
+ public:
+  //! Re-run a failing subtest once before reporting it (read by runSubtest)
+  static bool m_reRunFailed;
+  //! Emit TeamCity service messages while running tests
+  static bool m_svcMsg;
+
+  //! Constructor for App
+  //! @param platform = index of the OpenCL platform the tests run on
+  App(unsigned int platform)
+      : m_list(false),
+        m_console(true),
+        m_useCPU(false),
+        m_dump(false),
+        m_perflab(false),
+        m_noSysInfoPrint(false),
+        m_numItr(1),
+        mp_testOrder(NULL),
+        m_rndOrder(false),
+        m_spawned(0),
+        m_threads(1),
+        m_runthread(0),
+        m_width(512),
+        m_height(512),
+        m_window(0),
+        m_platform(platform),
+        // These two used to be left indeterminate although the destructor
+        // reads mpform_id and printOCLinfo()/AddWorkerThread() read
+        // m_deviceId.
+        mpform_id(NULL),
+        m_deviceId(0) {
+    // initialize OCLWrapper reference
+    m_wrapper = new OCLWrapper();
+
+    // m_workers = Set of worker objects that are used to run a subtest from a
+    // module; all slots start out empty
+    for (unsigned int i = 0; i < 256; i++) m_workers[i] = 0;
+
+    // Number of devices of the selected type on the chosen platform.
+    // findAdapters() exits the process if any OpenCL query fails.
+    m_numDevices = findAdapters(m_platform, m_useCPU, NULL);
+
+    // Report structure used to store the results of the tests.
+    // Only one Report (index 0) is allocated; there is no per-device report.
+    testReport = (Report**)malloc(sizeof(Report*));
+    testReport[0] = new Report;
+  }
+
+  //! Destructor for App
+  ~App() {
+    // Deleting the Worker objects
+    for (unsigned int i = 0; i < 256; i++) {
+      if (m_workers[i]) {
+        delete m_workers[i];
+        m_workers[i] = 0;
+      }
+    }
+
+    // Deleting the single report structure and the array holding it
+    delete testReport[0];
+    free(testReport);
+    m_wrapper->clUnloadPlatformAMD(mpform_id);
+
+    delete m_wrapper;
+  }
+
+  //! Function used to create a worker object corresponding to a subtest in a
+  //! module. Any worker already stored in slot 'index' is deleted first.
+  //! Requests with index >= 256 are ignored.
+  void SetWorker(unsigned int index, OCLWrapper* wrapper, Module* module,
+                 TestMethod run, unsigned int id, unsigned int subtest,
+                 unsigned int test, bool dump, bool view, bool useCPU,
+                 void* window, unsigned int x, unsigned int y, bool perflab,
+                 unsigned int deviceId, unsigned int platform) {
+    if (index >= 256) return;
+
+    if (m_workers[index]) delete m_workers[index];
+
+    m_workers[index] =
+        new Worker(wrapper, module, run, id, subtest, test, dump, view, useCPU,
+                   window, x, y, perflab, deviceId, platform);
+
+    assert(m_workers[index] != 0);
+  }
+
+  //! Function to return the 'index'th m_workers (0 when index is out of
+  //! range or the slot is empty)
+  Worker* GetWorker(unsigned int index) {
+    if (index >= 256) return 0;
+
+    return m_workers[index];
+  }
+
+  //! Run the worker stored in slot 'workerindex'.
+  //! @param usage = non-zero: run the worker's test method on a new thread
+  //!                from m_pool; zero: run it synchronously on the calling
+  //!                thread and record the result immediately
+  void AddThread(unsigned int workerindex, unsigned int usage) {
+    Worker* worker = GetWorker(workerindex);
+    if (worker == 0) {
+      return;
+    }
+
+    if (usage != 0) {
+      // Creating a thread whose entry point is the worker's test method and
+      // whose argument is the Worker object itself
+      m_pool[workerindex].create(worker->getTestMethod(), (void*)(worker));
+      m_spawned++;
+    } else {
+      // Same as above without using threads
+      TestMethod run = worker->getTestMethod();
+      if (run) {
+        run(worker);
+        UpdateTestReport(workerindex, worker->getResult());
+      }
+    }
+  }
+
+  //! Function which waits for all threads to execute and also updates the
+  //! report.
+  //! NOTE(review): slots are walked by spawn count (0..m_spawned-1) while
+  //! AddThread() indexes m_pool by worker index and only testReport[0]
+  //! exists; this is only consistent while at most one thread is spawned.
+  void WaitAllThreads() {
+    for (unsigned int w = 0; w < m_spawned; w++) {
+      m_pool[w].join();
+      UpdateTestReport(w, m_workers[w]->getResult());
+    }
+    m_spawned = 0;
+  }
+
+  //! Function to add a worker thread so as to run a subtest of a module
+  //! @param run = runSubtest function
+  //! @param index = index of the module in m_modules
+  //! @param subtest = the subtest number to run
+  //! @param usage = whether to use threads or not
+  //! @param test = The test in the module to be executed
+  void AddWorkerThread(unsigned int index, unsigned int subtest,
+                       unsigned int test, unsigned int usage, TestMethod run) {
+    if (m_spawned > m_threads) {
+      WaitAllThreads();
+    }
+
+    // A single worker (slot 0) bound to the selected device is used
+    SetWorker(0, m_wrapper, &m_modules[index], run, m_spawned, subtest, test,
+              m_dump, !m_console, m_useCPU, m_window, m_width, m_height,
+              m_perflab, m_deviceId, m_platform);
+
+    // Creating and executing the thread for that worker
+    AddThread(0, usage);
+  }
+
+  //! Log the system/device information unless disabled
+  void printOCLinfo(void);
+
+  //! Function to process the commandline arguments
+  void CommandLine(unsigned int argc, char** argv);
+
+  //! Function to scan for the different tests in the module
+  void ScanForTests();
+
+  //! Function to run all the specified tests
+  void RunAllTests();
+
+  //! Free memory
+  void CleanUp();
+
+  //! Function to set the order in which test are executed.
+  void SetTestRunOrder(int);
+
+  //! Function to print the test order
+  void PrintTestOrder(int);
+
+  //! Function to get the number of iterations.
+  int GetNumItr(void) { return m_numItr; }
+
+ private:
+  typedef std::vector<unsigned int> TestIndexList;
+  typedef std::vector<std::string> StringList;
+
+  void AddToList(StringList& strlist, const char* str);
+  void LoadList(StringList& strlist, const char* filename);
+
+  bool TestInList(StringList& strlist, const char* testname);
+
+  //! Array storing the reports; only element 0 is allocated
+  Report** testReport;
+
+  //! Function to fold one result into testReport[index].
+  //! A NULL or failed result counts as a failed test; a passed result only
+  //! updates the recorded min/max values.
+  void UpdateTestReport(int index, TestResult* result) {
+    if (result == NULL || !result->passed) {
+      testReport[index]->numFailedTests++;
+      testReport[index]->success = false;
+      return;
+    }
+
+    if (testReport[index]->max->value < result->value) {
+      testReport[index]->max->value = result->value;
+      testReport[index]->max->resultString = result->resultString;
+    }
+    if (testReport[index]->min->value > result->value) {
+      testReport[index]->min->value = result->value;
+      testReport[index]->min->resultString = result->resultString;
+    }
+  }
+
+  //! Functions used to find the range of the tests to be run
+  void GetTestIndexList(TestIndexList& testIndices, StringList& testList,
+                        const char* szModuleTestname, int maxIndex);
+  void PruneTestIndexList(TestIndexList& testIndices,
+                          TestIndexList& avoidIndices,
+                          TestIndexList& erasedIndices);
+
+  StringList m_paths;
+  StringList m_tests;
+  StringList m_avoid;
+  std::vector<Module> m_modules;
+  bool m_list;
+  bool m_console;
+  bool m_useCPU;
+  bool m_dump;
+  bool m_perflab;
+  bool m_noSysInfoPrint;
+  int m_numItr;
+  int* mp_testOrder;
+  bool m_rndOrder;
+
+  //! m_pool = Various threads created to execute tests on multiple devices
+  OCLutil::Thread m_pool[256];
+
+  Worker* m_workers[256];
+
+  //! Number of threads spawned
+  unsigned int m_spawned;
+
+  //! Upper limit on the number of threads that can be spawned
+  unsigned int m_threads;
+  unsigned int m_runthread;
+  unsigned int m_width;
+  unsigned int m_height;
+  void* m_window;
+
+  //! which index/platform id from the platforms vector returned by
+  //! cl::Platform::get we should run on
+  unsigned int m_platform;
+  cl_platform_id mpform_id;
+
+  //! Number of devices on the system
+  unsigned int m_numDevices;
+
+  //! Device ID to use on the system
+  unsigned int m_deviceId;
+
+  // OCLWrapper reference
+  OCLWrapper* m_wrapper;
+};
+
+//! Log the platform/device description produced by oclSysInfo(), unless
+//! system-information printing has been disabled (m_noSysInfoPrint).
+void App::printOCLinfo(void) {
+  if (m_noSysInfoPrint) {
+    return;
+  }
+
+  std::string calinfo;
+  oclSysInfo(calinfo, m_useCPU, m_deviceId, m_platform);
+  // The text comes from the driver (device/board names); pass it as an
+  // argument so a '%' inside it is never interpreted as a format directive.
+  oclTestLog(OCLTEST_LOG_ALWAYS, "%s", calinfo.c_str());
+}
+
+/*-----------------------------------------------------
+Function to randomize the order in which tests are executed
+-------------------------------------------------------*/
+#ifdef ATI_OS_WIN
+#include <time.h>
+#endif
+// void App::SetTestRunOrder(int test_count)
+//! Fill mp_testOrder with the execution order of the tests of one module.
+//! The order follows the requested test names in m_tests (reduced to their
+//! base name, i.e. the text before '['); when m_rndOrder is set the resulting
+//! array is shuffled afterwards.
+//! @param mod_index = index of the module in m_modules
+//! mp_testOrder must already point to an array with one entry per test of
+//! that module.
+void App::SetTestRunOrder(int mod_index) {
+  assert(mp_testOrder != NULL);
+  unsigned int test_count = m_modules[mod_index].get_count();
+
+  // Collect the requested base names, keeping the first occurrence of each
+  StringList uniqueTests;
+  for (unsigned int i = 0; i < m_tests.size(); ++i) {
+    std::string szTestName = m_tests[i];
+
+    // tests may be requested with an index suffix such as name[3];
+    // ordering is based on the base name only
+    size_t nFirstBracket = szTestName.find("[");
+    if (nFirstBracket != std::string::npos)
+      szTestName = szTestName.substr(0, nFirstBracket);
+
+    if (std::find(uniqueTests.begin(), uniqueTests.end(), szTestName) ==
+        uniqueTests.end()) {
+      AddToList(uniqueTests, szTestName.c_str());
+    }
+  }
+
+  // Entries of this module start after the tests of all previous modules.
+  // The offset does not depend on the loop variables, so compute it once.
+  size_t offset = 0;
+  for (int k = 0; k < mod_index; k++) offset += m_modules[k].get_count();
+
+  // The bound is checked on (offset + i): the former check on 'i' alone let
+  // the lookup run past the end of uniqueTests for every module but the first
+  // whenever fewer names than tests were requested.
+  for (unsigned int i = 0;
+       i < test_count && offset + i < uniqueTests.size(); i++) {
+    const std::string& szTestName = uniqueTests[offset + i];
+
+    for (unsigned int j = 0; j < test_count; j++) {
+      if (strcmp(szTestName.c_str(), m_modules[mod_index].get_name(j)) == 0) {
+        mp_testOrder[i] = j;
+        break;
+      }
+    }
+  }
+
+  if (m_rndOrder) {
+    srand((unsigned int)time(NULL));
+    for (unsigned int i = 0; i < test_count; i++) {
+      // find two random indices in [0, test_count)
+      int index1 = (int)((float)test_count * (rand() / (RAND_MAX + 1.0)));
+      int index2 = (int)((float)test_count * (rand() / (RAND_MAX + 1.0)));
+      // swap the data
+      int tmp = mp_testOrder[index1];
+      mp_testOrder[index1] = mp_testOrder[index2];
+      mp_testOrder[index2] = tmp;
+    }
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Process device string. Returns true if there is a primary ATI Radeon device
+// adapter, false otherwise
+// Classify a device description string.
+// Returns true when the string contains "Radeon", "R600", "RV630", "RV670",
+// or both "Stream" and "Processor", and does not contain "Secondary"
+// (secondary devices are not adapters). Returns false otherwise.
+static bool procDevString(const char* devString) {
+  const bool isStreamProcessor = strstr(devString, "Stream") != NULL &&
+                                 strstr(devString, "Processor") != NULL;
+  const bool isKnownDevice =
+      strstr(devString, "Radeon") != NULL ||
+      strstr(devString, "R600") != NULL ||
+      strstr(devString, "RV630") != NULL ||
+      strstr(devString, "RV670") != NULL || isStreamProcessor;
+
+  if (!isKnownDevice) {
+    return false;
+  }
+
+  // A recognised device only counts when it is not a secondary one
+  return strstr(devString, "Secondary") == NULL;
+}
+
+//!
+//! Function to find the total number of adapters on the system
+//!
+int findAdapters(unsigned int platformIdx, bool useCPU,
+                 cl_platform_id* mpform) {
+  // Any failing OpenCL query below terminates the process via CHECK_RESULT.
+  cl_uint platformCount = 0;
+  cl_int status = clGetPlatformIDs(0, NULL, &platformCount);
+  CHECK_RESULT((status != CL_SUCCESS), "clGetPlatformIDs failed");
+
+  CHECK_RESULT((platformIdx >= platformCount), "Invalid platform");
+
+  // platformCount is at least 1 here, otherwise the check above has exited
+  std::vector<cl_platform_id> platformList(platformCount);
+  status = clGetPlatformIDs(platformCount, &platformList[0], NULL);
+  CHECK_RESULT(status != CL_SUCCESS, "clGetPlatformIDs failed");
+
+  const cl_platform_id selected = platformList[platformIdx];
+
+  // Only the number of devices of the requested type is queried
+  const cl_device_type devType =
+      useCPU ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU;
+  unsigned int adapterCount = 0;
+  status = clGetDeviceIDs(selected, devType, 0, 0, &adapterCount);
+  CHECK_RESULT((status != CL_SUCCESS), "clGetDeviceIDs failed");
+
+  // Optionally hand the selected platform back to the caller
+  if (mpform != NULL) {
+    *mpform = selected;
+  }
+
+  return (int)adapterCount;
+}
+
+//! Number of passes to run for a test.
+//! Timing-based calibration is disabled, so the answer is always one pass
+//! and 'test' is not consulted.
+int calibrate(OCLTest* test) {
+  (void)test;
+  return 1;
+}
+
+//! Placeholder thread entry point: counts to one million and hands its
+//! argument back unchanged.
+void* dummyThread(void* argv) {
+  for (unsigned int spin = 0; spin < 1000000; ++spin) {
+    // intentionally empty
+  }
+
+  return argv;
+}
+
+//! Function used to run the test specified
+//! It would look something like OCLPerfInputspeed[0]
+double run(OCLTest* test, int passes) {
+  CPerfCounter stopwatch;
+
+  // Time 'passes' consecutive executions of the test as one interval
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (int done = 0; done < passes; ++done) {
+    test->run();
+  }
+  stopwatch.Stop();
+
+  const double elapsed = stopwatch.GetElapsedTime();
+  stopwatch.Reset();
+
+  return elapsed;
+}
+
+//! Function to display the result after a test is finished
+//! It also stores the result in a TestResult object
+void report(Worker* w, const char* testname, int testnum, unsigned int crc,
+            const char* errorMsg, float timer, TestResult* tr,
+            const char* testDesc) {
+  unsigned int thread = w->getId();
+  unsigned int deviceId = w->getDeviceId();
+
+  // Perflab mode prints the bare number only; 'tr' is left untouched
+  if (w->getPerflab()) {
+    oclTestLog(OCLTEST_LOG_ALWAYS, "%10.3f\n", timer);
+    return;
+  }
+
+  const char* verdict = tr->passed ? "PASSED" : "FAILED";
+  const char* deviceKind = w->isCPUEnabled() ? "CPU" : "GPU";
+
+  // The line is built with snprintf: testDesc and errorMsg have no length
+  // limit and used to overflow this buffer through sprintf. Overlong lines
+  // are now truncated instead.
+  char tmpUnits[256];
+
+  // If crc is not 0 or errorMsg is not empty, print the full stats
+  if ((crc != 0) || (errorMsg && (errorMsg[0] != '\0'))) {
+    snprintf(tmpUnits, sizeof(tmpUnits),
+             "%s %s: %s[%d] T[%1d] [%3d], %10.3f %-20s (chksum 0x%08x)\n",
+             testDesc, verdict, deviceKind, deviceId, thread, testnum, timer,
+             // crc != 0 alone selects this branch, so errorMsg may be NULL
+             errorMsg ? errorMsg : "", crc);
+  } else {
+    snprintf(tmpUnits, sizeof(tmpUnits),
+             "%s %s: %s[%d] T[%1d] [%3d], %10.3f\n", testDesc, verdict,
+             deviceKind, deviceId, thread, testnum, timer);
+  }
+
+  // Log through "%s": the line holds test-provided text and must not be
+  // interpreted as a format string
+  oclTestLog(OCLTEST_LOG_ALWAYS, "%s", tmpUnits);
+
+  tr->value = timer;
+  tr->resultString.assign(tmpUnits);
+
+  if (App::m_svcMsg && !tr->passed) {
+    // Every input character expands to at most two output characters, so
+    // twice the source buffer size always suffices (terminator included)
+    char escaped[2 * sizeof(tmpUnits)];
+
+    char* ptr = escaped;
+    for (int i = 0; tmpUnits[i] != '\0'; ++i) {
+      switch (tmpUnits[i]) {
+        case '\n':
+          *ptr++ = '|';
+          *ptr++ = 'n';
+          break;
+        case '\r':
+          *ptr++ = '|';
+          *ptr++ = 'r';
+          break;
+        case '\'':
+        case '|':
+        case ']':
+        case '[':
+          *ptr++ = '|';
+          // intentional fall through: emit the character after the '|'
+        default:
+          *ptr++ = tmpUnits[i];
+          break;
+      }
+    }
+    *ptr = '\0';
+
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "##teamcity[testFailed name='%s.%s.%d' message='FAILED' "
+               "details='%s']\n",
+               w->getModule()->get_libname(), testname, testnum, escaped);
+  }
+}
+
+//! Thread Entry point
+//! Thread entry point: executes one test of a module subtest.
+//! @param worker = Worker object (passed as void*) describing what to run
+//! @return always NULL; the outcome is stored in the worker's TestResult
+//! When App::m_reRunFailed is set (and TeamCity messages are off) a failing
+//! test is executed a second time and reported as "Conditional PASS" if the
+//! second attempt succeeds.
+void* runSubtest(void* worker) {
+  char units[256] = "";
+  double conversion = 0.0;
+  unsigned int crc = 0;
+  bool second_run = false;
+
+  // Getting the worker object that is running in this thread
+  Worker* w = (Worker*)worker;
+
+  if (w == 0) return NULL;
+
+  unsigned int test = w->getTestIndex();
+  unsigned int subtest = w->getSubTest();
+  unsigned int deviceId = w->getDeviceId();
+  TestResult* result = w->getResult();
+
+RERUN_TEST:
+  // Acquiring lock on the 'module' object common to all threads
+  moduleLock.lock();
+  Module* m = w->getModule();
+  if (m == 0 || m->create_test == 0) {
+    // Release the lock on the way out; returning with it held would block
+    // every other thread that needs the module
+    moduleLock.unlock();
+    return NULL;
+  }
+  // If we can, used the cached version,
+  // otherwise create the test.
+  OCLTest* pt = (m->cached_test ? m->cached_test[subtest] : NULL);
+  if (!pt) {
+    pt = m->create_test(subtest);
+    if (pt == 0) {
+      // Checked before the first dereference (the old check came too late)
+      moduleLock.unlock();
+      return NULL;
+    }
+    if (pt->cache_test() && m->cached_test) {
+      m->cached_test[subtest] = pt;
+    }
+  }
+  pt->clearError();
+  OCLTestImp* tmp = pt->toOCLTestImp();
+  if (tmp) {
+    tmp->setOCLWrapper(w->getOCLWrapper());
+  }
+  std::string subtestName = m->get_name(subtest);
+  moduleLock.unlock();
+
+  pt->resetDescString();
+  if (App::m_svcMsg) {
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "##teamcity[testStarted name='%s.%s.%d' "
+               "captureStandardOutput='true']\n",
+               m->get_libname(), subtestName.c_str(), test);
+  }
+  // setting the type to CPU.
+  if (w->isCPUEnabled()) {
+    pt->useCPU();
+  }
+  // Setting the device according to the worker thread
+  pt->setDeviceId(w->getDeviceId());
+  pt->setPlatformIndex(w->getPlatformID());
+  // Opening the 'test'th subtest of 'pt'
+  pt->open(test, units, conversion, deviceId);
+  pt->clearPerfInfo();
+
+  // Bounded formatting: the subtest name has no fixed maximum length
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s[%3d]", subtestName.c_str(), test);
+  oclTestLog(OCLTEST_LOG_ALWAYS, "%-32s", buffer);
+
+  if (pt->hasErrorOccured()) {
+    // open() failed: report without running the test
+    result->passed = false;
+    report(w, subtestName.c_str(), test, crc, pt->getErrorMsg(),
+           pt->getPerfInfo(), result, pt->testDescString.c_str());
+  } else {
+    unsigned int n = calibrate(pt);
+    double timer = run(pt, n);
+    (void)timer;  // the reported figure comes from getPerfInfo()
+    crc = pt->close();
+
+    if (pt->hasErrorOccured()) {
+      // run second time if the test fails the first time.
+      if (!second_run && App::m_reRunFailed && !App::m_svcMsg) {
+        second_run = true;
+
+        // Clear the error while the object is still alive; this used to
+        // happen after destroy_test() and touched a destroyed test
+        pt->clearError();
+
+        // Destroying a test object
+        moduleLock.lock();
+        if (!pt->cache_test()) {
+          m->destroy_test(pt);
+        }
+        moduleLock.unlock();
+
+        goto RERUN_TEST;
+      }
+    }
+    result->passed = !pt->hasErrorOccured();
+    /// print conditional pass if it is passes the second time.
+    if (second_run && result->passed) {
+      report(w, subtestName.c_str(), test, crc, "Conditional PASS",
+             pt->getPerfInfo(), result, pt->testDescString.c_str());
+    } else {
+      report(w, subtestName.c_str(), test, crc, pt->getErrorMsg(),
+             pt->getPerfInfo(), result, pt->testDescString.c_str());
+    }
+  }
+  if (App::m_svcMsg) {
+    oclTestLog(OCLTEST_LOG_ALWAYS, "##teamcity[testFinished name='%s.%s.%d']\n",
+               m->get_libname(), subtestName.c_str(), test);
+  }
+
+  // Make sure we clear the error after we report that there was an error.
+  pt->clearError();
+
+  // Destroying a test object (cached tests stay alive for reuse)
+  moduleLock.lock();
+  if (!pt->cache_test()) {
+    m->destroy_test(pt);
+  }
+  moduleLock.unlock();
+  return NULL;
+}
+
+//! Log the name of one module, its test count, and then the test names in
+//! the order stored in mp_testOrder.
+//! @param mod_index = index of the module in m_modules
+void App::PrintTestOrder(int mod_index) {
+  Module& module = m_modules[mod_index];
+
+  oclTestLog(OCLTEST_LOG_ALWAYS, "Module: %s (%d tests)\n",
+             module.name.c_str(), module.get_count());
+
+  unsigned int position = 0;
+  while (position < module.get_count()) {
+    oclTestLog(OCLTEST_LOG_ALWAYS, "%s\n",
+               module.get_name(mp_testOrder[position]));
+    ++position;
+  }
+}
+
+//! Function that runs all the tests specified in the command-line
+//! Function that runs all the tests specified in the command-line.
+//! For every loaded module the execution order is computed, each selected
+//! test is expanded into its sub-test indices (minus the avoided ones), the
+//! indices are executed through worker threads, and finally a pass/fail
+//! summary is logged. With m_list set the tests are only listed.
+void App::RunAllTests() {
+#ifdef ATI_OS_WIN
+
+  if (!m_console) m_window = new Window("Test", 100, 100, m_width, m_height, 0);
+#endif
+
+  //
+  //  Add all tests to run list if none specified
+  //
+  if (m_tests.size() < 1) {
+    for (unsigned int i = 0; i < m_modules.size(); i++) {
+      for (unsigned int j = 0; j < m_modules[i].get_count(); j++) {
+        AddToList(m_tests, m_modules[i].get_name(j));
+      }
+    }
+  }
+
+  unsigned int num_passes = 0;
+  unsigned int num_failures = 0;
+
+  if (App::m_svcMsg) {
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "##teamcity[testSuiteStarted name='ocltst']\n");
+  }
+
+  //
+  //  Run each test
+  //
+  for (unsigned int i = 0; i < m_modules.size(); i++) {
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "\n-------------------------------------------------\n");
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "The OpenCL Testing Module %s Version = %d \n",
+               m_modules[i].get_libname(), m_modules[i].get_version());
+    oclTestLog(OCLTEST_LOG_ALWAYS, "------------------------------\n");
+
+    // array to keep track of order of test execution.
+    int test_count = m_modules[i].get_count();
+    mp_testOrder = new int[test_count];
+    memset((void*)mp_testOrder, 0, sizeof(*mp_testOrder) * test_count);
+    SetTestRunOrder(i);
+
+    //
+    //  List all tests first if the option was turned on
+    //
+    if (m_list) {
+      PrintTestOrder(i);
+      delete[] mp_testOrder;
+      // Do not keep a dangling pointer in the member after the release
+      mp_testOrder = NULL;
+      continue;
+    }
+
+    for (unsigned int itr_var = 0; itr_var < m_modules[i].get_count();
+         itr_var++) {
+      // done for random order generation
+      unsigned int subtest = mp_testOrder[itr_var];
+
+      const char* name = m_modules[i].get_name(subtest);
+      if (itr_var < m_tests.size() && TestInList(m_tests, name)) {
+        OCLTest* pt = NULL;
+        if (m_modules[i].cached_test) {
+          pt = m_modules[i].cached_test[subtest];
+        }
+        // Try to use the cached version first!
+        if (!pt) {
+          pt = m_modules[i].create_test(subtest);
+          if (!pt) {
+            // The module could not instantiate the test: count it as a
+            // failure instead of dereferencing a null pointer below
+            oclTestLog(OCLTEST_LOG_ALWAYS, "%s: failed to create test\n",
+                       name);
+            num_failures++;
+            continue;
+          }
+          if (pt->cache_test() && m_modules[i].cached_test) {
+            m_modules[i].cached_test[subtest] = pt;
+          }
+        }
+
+        int numSubTests = pt->getNumSubTests();
+        assert(numSubTests > 0);
+
+        // Requested indices, minus the ones listed as "avoid"
+        TestIndexList testIndices;
+        GetTestIndexList(testIndices, m_tests, name, numSubTests - 1);
+
+        TestIndexList avoidIndices;
+        GetTestIndexList(avoidIndices, m_avoid, name, numSubTests - 1);
+
+        TestIndexList erasedIndices;
+        PruneTestIndexList(testIndices, avoidIndices, erasedIndices);
+
+        int numTestsRun = 0;
+        for (unsigned int j = 0; j < testIndices.size(); j++) {
+          unsigned int test = testIndices[j];
+
+          WaitAllThreads();
+          AddWorkerThread(i, subtest, test, pt->getThreadUsage(), runSubtest);
+
+          for (unsigned int thread = 1;
+               (thread < m_threads) && (thread < m_modules.size()); thread++) {
+            AddWorkerThread(thread, subtest, test, pt->getThreadUsage(),
+                            dummyThread);
+          }
+
+          numTestsRun++;
+        }
+
+        WaitAllThreads();
+
+        // A test counts once, whatever its number of executed sub-tests;
+        // tests with no executed sub-test are not counted at all
+        if (numTestsRun > 0) {
+          if (testReport[0]->success) {
+            num_passes++;
+          } else {
+            num_failures++;
+          }
+        }
+        if (App::m_svcMsg) {
+          for (unsigned int j = 0; j < erasedIndices.size(); j++) {
+            oclTestLog(OCLTEST_LOG_ALWAYS,
+                       "##teamcity[testIgnored name='%s.%s.%d']\n",
+                       m_modules[i].get_libname(), name, erasedIndices[j]);
+          }
+        }
+
+        // Resetting the values of the (single) test report
+        for (unsigned int j = 0; j < 1; j++) {
+          testReport[j]->reset();
+        }
+        m_modules[i].destroy_test(pt);
+        if (m_modules[i].cached_test) {
+          m_modules[i].cached_test[subtest] = NULL;
+        }
+      }
+    }
+
+    // print the order in which the test are executed if they are
+    // randomized.
+    if (m_rndOrder) {
+      PrintTestOrder(i);
+    }
+    // deleting the test order
+    delete[] mp_testOrder;
+    mp_testOrder = NULL;
+  }
+
+  if (App::m_svcMsg) {
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "##teamcity[testSuiteFinished name='ocltst']\n");
+  }
+
+#ifdef ATI_OS_WIN
+  if (!m_console && m_window) {
+    ((Window*)m_window)->ConsumeEvents();
+  }
+#endif
+  float total_tests = (float)(num_passes + num_failures);
+
+  // Percentages stay at 0 when nothing was run (avoids dividing by zero)
+  float percent_passed = 0.0f;
+  float percent_failed = 0.0f;
+  float percent_total = 0.0f;
+  if (total_tests > 0) {
+    percent_passed = 100.0f * ((float)num_passes / total_tests);
+    percent_failed = 100.0f * ((float)num_failures / total_tests);
+    percent_total = 100.0f * ((float)total_tests / total_tests);
+  }
+
+  oclTestLog(OCLTEST_LOG_ALWAYS, "\n\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "----------------------------------------\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "Total Passed Tests:  %8d (%6.2f%s)\n",
+             num_passes, percent_passed, "%");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "Total Failed Tests:  %8d (%6.2f%s)\n",
+             num_failures, percent_failed, "%");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "----------------------------------------\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "Total Run Tests:     %8d (%6.2f%s)\n",
+             (int)total_tests, percent_total, "%");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "\n\n");
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+//! Append a copy of the C string 'str' to the end of 'strlist'
+void App::AddToList(StringList& strlist, const char* str) {
+  strlist.push_back(std::string(str));
+}
+
+//! Append every line of a text file to 'strlist', one entry per line.
+//! @param strlist  = list receiving the lines (line terminators removed)
+//! @param filename = path of the file; a file that cannot be opened is
+//!                   silently ignored
+//! Lines longer than the internal buffer are split into several entries.
+void App::LoadList(StringList& strlist, const char* filename) {
+  FILE* fp = fopen(filename, "r");
+
+  if (fp == NULL) return;
+
+  char buffer[1024];
+  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
+    size_t length = strlen(buffer);
+    if (length == 0) {
+      continue;
+    }
+
+    // Strip the line terminator. '\r' is removed as well so that list files
+    // with CRLF line endings do not yield names ending in '\r', which would
+    // never match a test or module name.
+    while (length > 0 &&
+           (buffer[length - 1] == '\n' || buffer[length - 1] == '\r')) {
+      buffer[--length] = '\0';
+    }
+    AddToList(strlist, buffer);
+  }
+
+  fclose(fp);
+}
+
+// Prints the usage summary for the executable `name` through oclTestLog and
+// then terminates the process with exit status 0 (it never returns).
+static void Help(const char* name) {
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "%s (-w | -v | -m | -M | -l | -t | -T | -p | -d | -x | -y | -g| "
+             "-o | -n )\n",
+             name);
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -w            : enable window mode\n");
+  // App::CommandLine maps -V to service messages and -v <n> to the log level.
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -V            : enable TeamCity service messages\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -v <level>    : set the log level (0 to 99)\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -d            : dump test output to "
+                                 "portable float map (pfm)\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -m <module>   : specify a DLL module with tests\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -M <filename> : specify a text file "
+                                 "with one DLL module per line\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -l            : list test names in DLL modules and exit\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -s <count>    : number of threads to spawn\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -t <testname> : run test\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -T <filename> : specify a text file with one test per line\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -a <testname> : specify a test to avoid\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -A <filename> : specify a text file of tests to avoid with "
+             "one test per line\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -p <platform> : specify a platform to run on, 'amd','nvidia' "
+             "or 'intel'\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -h            : this help text\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -x            : x dimension for debug "
+                                 "output image (and window)\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "   -y            : y dimension for debug "
+                                 "output image (and window)\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -P            : Perflab mode (just print the result without "
+             "any supplementary information)\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -n #number    : run the tests specified with -m, -M, -t or -T "
+             "options multiple times\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -r            : Option to Randomize the order in which the "
+             "tests are executed.\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -R            : Option to ReRun failed tests for conditional "
+             "pass.\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -i            : Don't print system information\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -g <GPUid>    : GPUid to run the tests on\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -o <filename> : dump the output to a specified file\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "   -c            : Run the test on the CPU device.\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "                 : \n");
+  oclTestLog(OCLTEST_LOG_ALWAYS,
+             "                 : To run only one subtest of a test, append the "
+             "subtest to\n");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "                 : the end of the test name "
+                                 "in brackets. i.e. test[1]");
+  oclTestLog(OCLTEST_LOG_ALWAYS, "\n");
+
+  exit(0);
+}
+
+// Maps a vendor keyword ("amd", "intel" or "nvidia") to the index of the first
+// OpenCL platform whose CL_PLATFORM_VENDOR string matches that vendor.
+// Returns 0 when the keyword is unknown, when any OpenCL query fails, or when
+// no platform matches, so index 0 doubles as the fall-back.
+unsigned int getPlatformID(const char* str) {
+  const unsigned int kFallbackPlatform = 0;
+  const std::string vendorKey(str);
+
+  // Only these three vendor keywords are recognised.
+  const char* wantedVendor = NULL;
+  if (vendorKey == "amd") {
+    wantedVendor = "Advanced Micro Devices, Inc.";
+  } else if (vendorKey == "intel") {
+    wantedVendor = "Intel(R) Corporation";
+  } else if (vendorKey == "nvidia") {
+    wantedVendor = "NVIDIA Corporation";
+  }
+  if (wantedVendor == NULL) {
+    return kFallbackPlatform;
+  }
+
+  cl_uint platformCount = 0;
+  if (clGetPlatformIDs(0, NULL, &platformCount) != CL_SUCCESS) {
+    return kFallbackPlatform;
+  }
+
+  unsigned int found = kFallbackPlatform;
+  cl_platform_id* ids = new cl_platform_id[platformCount];
+  if (clGetPlatformIDs(platformCount, ids, NULL) == CL_SUCCESS) {
+    for (unsigned int idx = 0; idx < platformCount; ++idx) {
+      char vendor[200];
+      const cl_int rc = clGetPlatformInfo(ids[idx], CL_PLATFORM_VENDOR,
+                                          sizeof(vendor), vendor, NULL);
+      // Stop scanning on the first query failure or the first match.
+      if (rc != CL_SUCCESS) {
+        break;
+      }
+      if (strcmp(vendor, wantedVendor) == 0) {
+        found = idx;
+        break;
+      }
+    }
+  }
+
+  delete[] ids;
+  return found;
+}
+
+// Pre-scans the command line for -p <platform> and returns the index computed
+// by getPlatformID (0 when -p is absent); all other options are skipped.
+unsigned int parseCommandLineForPlatform(unsigned int argc, char** argv) {
+  int c;
+  unsigned int platform = 0;
+
+  // "x:" because App::CommandLine reads optarg for -x; "h" so that -h is not
+  // reported by getopt as an invalid option during this first pass.
+  while ((c = getopt(argc, argv, "dg:hlm:M:o:Ps:t:T:a:A:p:v:wx:y:in:rcRV")) !=
+         -1) {
+    if (c == 'p') {
+      platform = getPlatformID(optarg);
+    }
+  }
+  return platform;
+}
+
+// Parses the command line into the App configuration members, then validates
+// the requested device index against the adapters found for m_platform.
+// Prints the usage text and exits (via Help) for -h, for an unrecognised
+// option, or when no work-selecting option was given. -p is accepted but
+// ignored here: main() resolves it earlier via parseCommandLineForPlatform.
+void App::CommandLine(unsigned int argc, char** argv) {
+  int c;
+  bool hasOption = false;
+  unsigned int tmpDeviceId = 0;
+  m_deviceId = 0;
+  int tmp;
+
+  // "x:" so that -x receives its value in optarg (it is read below); "h" so
+  // that -h reaches the Help case without a getopt "invalid option" message.
+  while ((c = getopt(argc, argv, "dg:hlm:M:o:Ps:t:T:a:A:p:v:wx:y:in:rcRV")) !=
+         -1) {
+    switch (c) {
+      case 'c':
+        m_useCPU = true;
+        break;
+
+      case 'p':
+        break;
+
+      case 'w':
+        m_console = false;
+        hasOption = true;
+        break;
+
+      case 'V':
+        m_svcMsg = true;
+        break;
+
+      case 'd':
+        m_dump = true;
+        hasOption = true;
+        break;
+
+      case 'm':
+        AddToList(m_paths, optarg);
+        hasOption = true;
+        break;
+
+      case 'M':
+        LoadList(m_paths, optarg);
+        hasOption = true;
+        break;
+
+      case 'a':
+        AddToList(m_avoid, optarg);
+        hasOption = true;
+        break;
+
+      case 'A':
+        LoadList(m_avoid, optarg);
+        hasOption = true;
+        break;
+
+      case 'l':
+        m_list = true;
+        hasOption = true;
+        break;
+
+      // command line switch to loop execution of any specified test or tests n
+      // number of times
+      case 'n':
+        m_numItr = atoi(optarg);
+        break;
+
+      // command line switch to randomize the order of test execution in OCLTest
+      case 'r':
+        m_rndOrder = true;
+        break;
+
+      // command line switch to rerun the failed tests to see if they pass on
+      // second run
+      case 'R':
+        m_reRunFailed = true;
+        break;
+
+      case 't':
+        AddToList(m_tests, optarg);
+        hasOption = true;
+        break;
+
+      case 'T':
+        LoadList(m_tests, optarg);
+        hasOption = true;
+        break;
+
+      case 's':
+        m_threads = atoi(optarg);
+        hasOption = true;
+        break;
+
+      case 'h':
+        Help(argv[0]);
+        break;
+
+      case 'x':
+        m_width = atoi(optarg);
+        hasOption = true;
+        break;
+
+      case 'y':
+        m_height = atoi(optarg);
+        hasOption = true;
+        break;
+
+      case 'P':
+        m_perflab = true;
+        hasOption = true;
+        break;
+
+      // index of the device to run on; range-checked after the loop
+      case 'g':
+        tmpDeviceId = (unsigned int)atoi(optarg);
+        break;
+
+      // log level; only values in [0, 100) are accepted
+      case 'v':
+        tmp = atoi(optarg);
+        if (tmp >= 0 && tmp < 100) {
+          oclTestSetLogLevel(tmp);
+        } else {
+          oclTestLog(OCLTEST_LOG_ALWAYS, "Invalid verbose level\n");
+        }
+        break;
+
+      case 'o':
+        hasOption = true;
+        oclTestEnableLogToFile(optarg);
+        break;
+
+      case 'i':
+        m_noSysInfoPrint = true;
+        break;
+
+      default:
+        Help(argv[0]);
+        break;
+    }
+  }
+
+  // Reset devices in case user overrode defaults
+  m_numDevices = findAdapters(m_platform, m_useCPU, &mpform_id);
+  if (m_numDevices < (tmpDeviceId + 1)) {
+    m_deviceId = 0;
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "User specified deviceId(%d) exceedes the number of "
+               "Devices(%d).  Using device %d.\n",
+               tmpDeviceId, m_numDevices, m_deviceId);
+  } else {
+    m_deviceId = tmpDeviceId;
+  }
+
+  if (!hasOption) {
+    Help(argv[0]);
+  }
+}
+
+// Returns true when `szModuleTestname` equals the name part of some entry in
+// `strlist`. The name part is the entry up to its first '[' when a ']' occurs
+// later in the entry, otherwise the whole entry. NULL never matches.
+bool App::TestInList(StringList& strlist, const char* szModuleTestname) {
+  if (szModuleTestname == NULL) {
+    return false;
+  }
+  for (unsigned int entry = 0; entry < strlist.size(); entry++) {
+    std::string candidate = strlist[entry];
+    // Drop a "[...]" subtest selector before comparing names.
+    const size_t openPos = candidate.find("[");
+    if (openPos != std::string::npos) {
+      const size_t closePos = candidate.find("]");
+      if ((closePos != std::string::npos) && (closePos > openPos)) {
+        candidate = candidate.substr(0, openPos);
+      }
+    }
+    if (strcmp(szModuleTestname, candidate.c_str()) == 0) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Appends to `testIndices` the subtest indices that the entries of
+// `testList` select for the test `szModuleTestname`, then sorts the list and
+// removes duplicate indices. Entry forms:
+//   name       - every index from 0 to maxIndex
+//   name[a]    - index a only
+//   name[a-b]  - indices a to b (swapped when given in reverse order)
+//   name[a-]   - indices a to maxIndex
+//   name[-b]   - indices 0 to b
+// A '[' without a ']' after it selects the full range; the name is still
+// cut at the '['. Entries naming a different test are ignored. A range that
+// reaches past maxIndex is reported once and clamped to maxIndex.
+//
+//   testIndices      - list the selected indices are appended to
+//   testList         - entries in one of the forms above
+//   szModuleTestname - test name an entry must match to contribute
+//   maxIndex         - highest valid subtest index
+void App::GetTestIndexList(TestIndexList& testIndices, StringList& testList,
+                           const char* szModuleTestname, int maxIndex) {
+  for (unsigned int i = 0; i < testList.size(); i++) {
+    IndicesRange nIndex = {0, maxIndex};
+
+    // If the test name string ends with [...] parse the text
+    // between the brackets to determine the index range.
+    std::string szTestName = testList[i];
+    const size_t nFirstBracket = szTestName.find("[");
+    if (nFirstBracket != std::string::npos) {
+      const size_t nLastBracket = szTestName.find("]");
+      if ((nLastBracket != std::string::npos) &&
+          (nLastBracket > nFirstBracket)) {
+        const std::string range = szTestName.substr(
+            nFirstBracket + 1, nLastBracket - nFirstBracket - 1);
+        // Search for the hyphen inside the brackets only, so that a '-' in
+        // the test name itself is not mistaken for the range separator.
+        const size_t nHyphen = range.find("-");
+        if (nHyphen == std::string::npos) {
+          nIndex.startIndex = atoi(range.c_str());
+          nIndex.endIndex = nIndex.startIndex;
+        } else {
+          // An empty side keeps its default: 0 for start, maxIndex for end.
+          if (nHyphen > 0) {
+            const std::string first = range.substr(0, nHyphen);
+            nIndex.startIndex = atoi(first.c_str());
+          }
+          if (nHyphen + 1 < range.size()) {
+            const std::string last = range.substr(nHyphen + 1);
+            nIndex.endIndex = atoi(last.c_str());
+          }
+        }
+      }
+
+      szTestName = szTestName.substr(0, nFirstBracket);
+    }
+
+    // Entries for other tests do not contribute.
+    if (strcmp(szModuleTestname, szTestName.c_str()) != 0) {
+      continue;
+    }
+
+    // If the values are out of order, swap them.
+    if (nIndex.startIndex > nIndex.endIndex) {
+      const int tmp = nIndex.startIndex;
+      nIndex.startIndex = nIndex.endIndex;
+      nIndex.endIndex = tmp;
+    }
+
+    // Report an out-of-range request once rather than once per index.
+    if (nIndex.endIndex > maxIndex) {
+      oclTestLog(OCLTEST_LOG_ALWAYS,
+                 "Error: Invalid test index for subtest: %s!\n",
+                 szModuleTestname);
+      nIndex.endIndex = maxIndex;
+    }
+
+    // Add the indices in the specified range to the list.
+    for (int index = nIndex.startIndex; index <= nIndex.endIndex; ++index) {
+      testIndices.push_back(index);
+    }
+
+    // std::unique only moves duplicates to the tail and returns the new
+    // logical end; the erase is what actually removes them.
+    std::sort(testIndices.begin(), testIndices.end());
+    testIndices.erase(std::unique(testIndices.begin(), testIndices.end()),
+                      testIndices.end());
+  }
+}
+
+// Moves each index found in avoidIndices from testIndices to erasedIndices.
+void App::PruneTestIndexList(TestIndexList& testIndices,
+                             TestIndexList& avoidIndices,
+                             TestIndexList& erasedIndices) {
+  TestIndexList::iterator cursor = testIndices.begin();
+  while (cursor != testIndices.end()) {
+    const unsigned int index = *cursor;
+    if (std::find(avoidIndices.begin(), avoidIndices.end(), index) ==
+        avoidIndices.end()) {
+      ++cursor;
+    } else {
+      cursor = testIndices.erase(cursor);
+      erasedIndices.push_back(index);
+    }
+  }
+}
+
+// Loads each module named in m_paths and appends a Module record for it to
+// m_modules. A module that cannot be loaded, or that does not export
+// OCLTestList_TestCount, is reported on stderr and skipped.
+void App::ScanForTests() {
+  for (unsigned int i = 0; i < m_paths.size(); i++) {
+    Module mod;
+#ifdef ATI_OS_WIN
+    // Drop a trailing line feed; the empty() test keeps rbegin() valid.
+    if (!m_paths[i].empty() && *m_paths[i].rbegin() == 0x0a) {
+      m_paths[i].erase(m_paths[i].size() - 1);
+    }
+    mod.hmodule = LoadLibrary(m_paths[i].c_str());
+#endif
+#ifdef ATI_OS_LINUX
+    mod.hmodule = dlopen(m_paths[i].c_str(), RTLD_NOW);
+#endif
+    if (mod.hmodule == NULL) {
+      fprintf(stderr, "Could not load module: %s\n", m_paths[i].c_str());
+#ifdef ATI_OS_LINUX
+      fprintf(stderr, "Error : %s\n", dlerror());
+#endif
+      continue;
+    }
+    mod.name = m_paths[i];
+#ifdef ATI_OS_WIN
+    mod.get_count = (TestCountFuncPtr)GetProcAddress(mod.hmodule,
+                                                     "OCLTestList_TestCount");
+    mod.get_name =
+        (TestNameFuncPtr)GetProcAddress(mod.hmodule, "OCLTestList_TestName");
+    mod.create_test = (CreateTestFuncPtr)GetProcAddress(
+        mod.hmodule, "OCLTestList_CreateTest");
+    mod.destroy_test = (DestroyTestFuncPtr)GetProcAddress(
+        mod.hmodule, "OCLTestList_DestroyTest");
+    mod.get_version = (TestVersionFuncPtr)GetProcAddress(
+        mod.hmodule, "OCLTestList_TestLibVersion");
+    mod.get_libname = (TestLibNameFuncPtr)GetProcAddress(
+        mod.hmodule, "OCLTestList_TestLibName");
+#endif
+#ifdef ATI_OS_LINUX
+    mod.get_count =
+        (TestCountFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestCount");
+    mod.get_name =
+        (TestNameFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestName");
+    mod.create_test =
+        (CreateTestFuncPtr)dlsym(mod.hmodule, "OCLTestList_CreateTest");
+    mod.destroy_test =
+        (DestroyTestFuncPtr)dlsym(mod.hmodule, "OCLTestList_DestroyTest");
+    mod.get_version =
+        (TestVersionFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestLibVersion");
+    mod.get_libname =
+        (TestLibNameFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestLibName");
+#endif
+    // get_count is called next; skip a module without it (it stays loaded).
+    if (mod.get_count == NULL) {
+      fprintf(stderr, "No OCLTestList_TestCount in: %s\n", m_paths[i].c_str());
+      continue;
+    }
+    mod.cached_test = new OCLTest*[mod.get_count()]();  // all entries NULL
+    m_modules.push_back(mod);
+  }
+}
+
+// Frees each module's cached-test pointer array and unloads the module's
+// library; on Windows it also destroys the window object.
+void App::CleanUp() {
+  for (unsigned int idx = 0; idx < m_modules.size(); idx++) {
+    // delete[] on a null pointer is a no-op, so no guard is needed.
+    delete[] m_modules[idx].cached_test;
+#ifdef ATI_OS_WIN
+    FreeLibrary(m_modules[idx].hmodule);
+#endif
+#ifdef ATI_OS_LINUX
+    dlclose(m_modules[idx].hmodule);
+#endif
+  }
+#ifdef ATI_OS_WIN
+  delete m_window;
+  m_window = 0;
+#endif
+}
+
+extern int optind;  // getopt's scan position; main() rewinds it below
+// Definitions of App's static flags; App::CommandLine sets them (-R, -V).
+bool App::m_reRunFailed = false;
+bool App::m_svcMsg = false;
+int main(int argc, char** argv) {
+  unsigned int platform = 0;
+  platform = parseCommandLineForPlatform(argc, argv);  // first pass: -p only
+  // Rewind getopt so that App::CommandLine can parse the whole command line.
+  optind = 0;
+  App app(platform);
+#ifdef ATI_OS_WIN
+  // This call registers the Windows service routine when ocltst is launched
+  // by the OS on service initialization. In other scenarios it does
+  // nothing.
+  serviceStubCall();
+  // SetErrorMode(SEM_NOGPFAULTERRORBOX);
+  // const LPTOP_LEVEL_EXCEPTION_FILTER oldFilter =
+  // SetUnhandledExceptionFilter(xFilter);
+#endif  // ATI_OS_WIN
+#ifdef AUTO_REGRESS
+  try {
+#endif /* AUTO_REGRESS */
+    app.CommandLine(argc, argv);
+    app.printOCLinfo();
+    app.ScanForTests();
+    for (int i = 0; i < app.GetNumItr(); i++) {  // repeat the full run
+      app.RunAllTests();
+    }
+    app.CleanUp();
+#ifdef AUTO_REGRESS
+  } catch (...) {
+    oclTestLog(OCLTEST_LOG_ALWAYS, "Exiting due to unhandled exception!\n");
+    return (-1);
+  }
+#endif /* AUTO_REGRESS */
+
+  return 0;
+}
+
+#ifdef ATI_OS_WIN
+
+#include <dbghelp.h>
+
+typedef unsigned int uint32;
+typedef size_t uintp;  // holds addresses and opaque keys in this file
+// One stack frame as recorded by dumpTraceBack.
+struct StackEntry {
+  uintp addr;        // program counter of the frame
+  uint32 line;       // source line number from the line lookup
+  uint32 disp;       // displacement from the symbol lookup
+  char symbol[128];  // symbol name for the frame
+  char file[128];    // source file name for the frame
+};
+// Upper bound on the number of frames kept in Info::stack.
+static const unsigned int MAX_DEPTH_PER_NODE = 24;
+struct Info {
+  bool operator==(const Info& b) const { return key == b.key; }  // by key only
+
+  uintp key;  // pointer, handle, whatever
+  StackEntry stack[MAX_DEPTH_PER_NODE];  // frames in StackWalk64 order
+};
+
+// Walks the call stack described by `context` (at most MAX_DEPTH_PER_NODE
+// frames), resolves each frame's symbol and source line through DbgHelp and
+// logs one line per frame. Frames whose lookup fails are logged with empty
+// names. `context` is handed to StackWalk64, which may update it.
+static void dumpTraceBack(CONTEXT& context) {
+  Info info;
+  // Zero everything so unresolved fields read as 0 / "" when printed.
+  memset(&info, 0, sizeof(info));
+  oclTestLog(OCLTEST_LOG_ALWAYS, "Exception: exiting!\n");
+  HANDLE process = GetCurrentProcess();
+
+  STACKFRAME64 stackframe;
+  memset(&stackframe, 0, sizeof(STACKFRAME64));
+  stackframe.AddrPC.Mode = AddrModeFlat;
+  stackframe.AddrStack.Mode = AddrModeFlat;
+  stackframe.AddrFrame.Mode = AddrModeFlat;
+  // The machine type must match the register set read from `context`.
+#if defined(_WIN64)
+  const DWORD machine = IMAGE_FILE_MACHINE_AMD64;
+  stackframe.AddrPC.Offset = context.Rip;
+  stackframe.AddrStack.Offset = context.Rsp;
+  stackframe.AddrFrame.Offset = context.Rbp;
+#else
+  const DWORD machine = IMAGE_FILE_MACHINE_I386;
+  stackframe.AddrPC.Offset = context.Eip;
+  stackframe.AddrStack.Offset = context.Esp;
+  stackframe.AddrFrame.Offset = context.Ebp;
+#endif
+  unsigned int depth = 0;
+
+  if (SymInitialize(process, NULL, true)) {
+    while ((depth < MAX_DEPTH_PER_NODE) &&
+           StackWalk64(machine, process, GetCurrentThread(), &stackframe,
+                       &context, NULL, SymFunctionTableAccess64,
+                       SymGetModuleBase64, NULL)) {
+      if (stackframe.AddrPC.Offset == 0) {
+        continue;
+      }
+      StackEntry& entry = info.stack[depth];
+      const DWORD64 addr = stackframe.AddrPC.Offset;
+      entry.addr = (uintp)addr;
+
+      // IMAGEHLP_SYMBOL64 ends in a variable-length name, so reserve room
+      // after it; the union keeps the storage suitably aligned.
+      union {
+        IMAGEHLP_SYMBOL64 sym;
+        char raw[sizeof(IMAGEHLP_SYMBOL64) + sizeof(entry.symbol)];
+      } symbolBuf;
+      memset(&symbolBuf, 0, sizeof(symbolBuf));
+      symbolBuf.sym.SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64);
+      symbolBuf.sym.MaxNameLength = sizeof(entry.symbol);
+
+      DWORD64 symbolDisp = 0;
+      if (SymGetSymFromAddr64(process, addr, &symbolDisp, &symbolBuf.sym)) {
+        // Bounded copy; the zeroed entry guarantees the terminator.
+        strncpy(entry.symbol, symbolBuf.sym.Name, sizeof(entry.symbol) - 1);
+        entry.disp = (uint32)symbolDisp;
+      }
+
+      IMAGEHLP_LINE64 lineInfo;
+      memset(&lineInfo, 0, sizeof(lineInfo));
+      lineInfo.SizeOfStruct = sizeof(lineInfo);
+      DWORD lineDisp = 0;
+      if (SymGetLineFromAddr64(process, addr, &lineDisp, &lineInfo)) {
+        strncpy(entry.file, lineInfo.FileName, sizeof(entry.file) - 1);
+        entry.line = lineInfo.LineNumber;
+      }
+      depth++;
+    }
+    SymCleanup(process);
+  }
+
+  // Print only the frames that were actually recorded.
+  for (unsigned int j = 0; j < depth; j++) {
+    oclTestLog(OCLTEST_LOG_ALWAYS, "        %s()+%u (0x%.8llx)  %s:%u\n",
+               info.stack[j].symbol, info.stack[j].disp,
+               (unsigned long long)info.stack[j].addr, info.stack[j].file,
+               info.stack[j].line);
+  }
+}
+
+// Exception filter: logs a stack trace for the context stored in `xEP` and
+// returns EXCEPTION_EXECUTE_HANDLER.
+static LONG WINAPI xFilter(LPEXCEPTION_POINTERS xEP) {
+  // Work on a private copy: dumpTraceBack takes the context by reference.
+  CONTEXT snapshot;
+  memcpy(&snapshot, xEP->ContextRecord, sizeof(snapshot));
+
+  dumpTraceBack(snapshot);
+
+  return (EXCEPTION_EXECUTE_HANDLER);
+}
+#undef CHECK_RESULT
+#endif  // WIN_OS
+
+/////////////////////////////////////////////////////////////////////////////
diff --git a/projects/clr/opencl/tests/ocltst/env/pfm.cpp b/projects/clr/opencl/tests/ocltst/env/pfm.cpp
new file mode 100644
index 0000000000..4e22fe1d8c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/pfm.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "pfm.h"
+
+#ifdef ATI_OS_WIN
+#include <io.h>
+#endif
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+// Writes `buffer` (height rows of width pixels, `components` floats per
+// pixel, first row first) to `filename` as a three-channel PFM image. Rows
+// are written last to first; pixels with fewer than three components
+// replicate component 0 into the missing channels.
+// Returns 0 on success and 1 if the file cannot be opened, memory for one
+// scanline cannot be allocated, or a write fails.
+unsigned int SavePFM(const char* filename, const float* buffer,
+                     unsigned int width, unsigned int height,
+                     unsigned int components) {
+  // One RGB scanline, sized from `width` so wide images cannot overrun it.
+  const size_t lineSize = (size_t)width * 3;
+  float* line = (float*)malloc((lineSize ? lineSize : 1) * sizeof(float));
+  if (line == NULL) {
+    return 1;
+  }
+
+  FILE* fh = fopen(filename, "wb");
+  if (fh == NULL) {
+    free(line);
+    return 1;
+  }
+
+  // Header: "PF" tag, then the dimensions, then a scale factor of -1.
+#define PFMEOL "\x0a"
+  fprintf(fh, "PF" PFMEOL "%u %u" PFMEOL "-1" PFMEOL, width, height);
+
+  unsigned int error = 0;
+  for (unsigned int y = height; y > 0 && error == 0; y--) {
+    const float* v = buffer + (size_t)components * width * (y - 1);
+    for (unsigned int x = 0; x < width; x++) {
+      const float* px = v + (size_t)x * components;
+      line[x * 3 + 0] = px[0];
+      line[x * 3 + 1] = (components > 1) ? px[1] : px[0];
+      line[x * 3 + 2] = (components > 2) ? px[2] : px[0];
+    }
+    if (fwrite(line, sizeof(float), lineSize, fh) != lineSize) {
+      error = 1;
+    }
+  }
+
+  // fclose flushes buffered data, so its failure is a write failure too.
+  if (fclose(fh) != 0) {
+    error = 1;
+  }
+  free(line);
+  return error;
+}
diff --git a/projects/clr/opencl/tests/ocltst/env/pfm.h b/projects/clr/opencl/tests/ocltst/env/pfm.h
new file mode 100644
index 0000000000..60814d5f4d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/env/pfm.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _PFM_H_
+#define _PFM_H_
+// Saves a float image as a PFM file; returns 0 on success, 1 on failure.
+extern unsigned int SavePFM(const char* filename, const float* buffer,
+                            unsigned int width, unsigned int height,
+                            unsigned int components);
+
+#endif  // _PFM_H_
diff --git a/projects/clr/opencl/tests/ocltst/include/OCL/Thread.h b/projects/clr/opencl/tests/ocltst/include/OCL/Thread.h
new file mode 100644
index 0000000000..47ac2642c3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/include/OCL/Thread.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCL_THREAD_H
+#define OCL_THREAD_H
+
+//!
+//! \file Thread.h
+//!
+
+#ifdef ATI_OS_WIN
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0501
+#endif
+
+#include "windows.h"
+#else
+#include "pthread.h"
+#endif
+
+//! Signature of a thread entry point, as accepted by Thread::create:
+//! takes one opaque pointer argument and returns a pointer.
+typedef void *(*oclThreadFunc)(void *);
+
+namespace OCLutil {
+//! \class Lock
+//! \brief Provides a wrapper for locking primitives used to
+//!  synchronize _CPU_ threads.
+//!
+//! Common usage would be:
+//!
+//!    OCLutil::Lock lock;
+//!
+//!    ....
+//!
+//!    // Critical section begins
+//!
+//!    lock.lock();
+//!
+//!    .....
+//!
+//!    // Critical section ends
+//!
+//!    lock.unlock();
+//!
+
+class Lock {
+ public:
+  //! Constructor for Lock
+  Lock();
+
+  //! Destructor for Lock
+  ~Lock();
+
+  //! Acquire the lock, waiting on it if it is not available
+  void lock();
+
+  //! Try to acquire the lock without waiting; the bool result presumably
+  //! reports whether the lock was obtained (confirm in the implementation)
+  bool tryLock();
+
+  //! Unlock the lock and return
+  void unlock();
+
+ private:
+  /////////////////////////////////////////////////////////////
+  //!
+  //! Private data members and methods
+  //!
+
+  //! System specific synchronization primitive
+#ifdef ATI_OS_WIN
+  CRITICAL_SECTION _cs;
+#else
+  pthread_mutex_t _lock;
+#endif
+};
+
+//////////////////////////////////////////////////////////////
+//!
+//! \class Thread
+//! \brief Provides a wrapper for creating a _CPU_ thread.
+//!
+//! This class provides a simple wrapper to a CPU thread.
+//! The class name might be a bit confusing, esp considering
+//! the GPU has its own threads as well.
+//!
+class Thread {
+ public:
+  //! Thread constructor and destructor. Note that the thread is
+  //! NOT created in the constructor. The thread creation takes
+  //! place in the create method
+  Thread();
+
+  ~Thread();
+
+  //! Wrapper for pthread_create. Pass the thread's entry
+  //! point and the data to be passed to that routine
+  bool create(oclThreadFunc func, void *arg);
+
+  //! Wrapper for pthread_join. The calling thread
+  //! will wait until _this_ thread exits
+  bool join();
+
+  //! Get the thread data passed by the application
+  void *getData() { return _data; }
+
+  //! Get the thread ID (static, so presumably of the calling thread - confirm)
+  static unsigned int getID();
+
+ private:
+  /////////////////////////////////////////////////////////////
+  //!
+  //! Private data members and methods
+  //!
+
+#ifdef ATI_OS_WIN
+  //!  store the handle
+  HANDLE _tid;
+  //! presumably the numeric thread identifier - confirm in OCLThread.cpp
+  unsigned int _ID;
+#else
+  pthread_t _tid;
+  //! pthread attribute object kept alongside the thread handle
+  pthread_attr_t _attr;
+#endif
+  //! pointer returned by getData()
+  void *_data;
+};
+};  // namespace OCLutil
+#endif
diff --git a/projects/clr/opencl/tests/ocltst/include/OCLLog.h b/projects/clr/opencl/tests/ocltst/include/OCLLog.h
new file mode 100644
index 0000000000..6b138eb030
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/include/OCLLog.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCLLOG_H_
+#define OCLLOG_H_
+
+#ifdef ATI_OS_WIN
+// DLLIMPORT exports when OCLTST_LOG_BUILD is defined and imports otherwise.
+#ifdef OCLTST_LOG_BUILD
+#define DLLIMPORT __declspec(dllexport)
+#else
+#define DLLIMPORT __declspec(dllimport)
+#endif  // OCLTST_LOG_BUILD
+
+#else
+#define DLLIMPORT
+
+#endif  // ATI_OS_WIN
+
+enum oclLoggingLevel {
+  OCLTEST_LOG_ALWAYS,   // value 0; presumably never filtered out - confirm
+  OCLTEST_LOG_VERBOSE,  // value 1; presumably needs a raised level - confirm
+};
+
+extern DLLIMPORT void oclTestLog(oclLoggingLevel logLevel, const char* fmt,
+                                 ...);  // printf-style message at logLevel
+extern DLLIMPORT void oclTestSetLogLevel(int level);  // used for -v <level>
+extern DLLIMPORT void oclTestEnableLogToFile(const char* filename);  // for -o
+
+#endif  // OCLLOG_H_
diff --git a/projects/clr/opencl/tests/ocltst/include/OCLTest.h b/projects/clr/opencl/tests/ocltst/include/OCLTest.h
new file mode 100644
index 0000000000..7923daccb4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/include/OCLTest.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLTEST_H_
+#define _OCLTEST_H_
+
+#include <string>
+
+#include "OCLWrapper.h"
+
+class BaseTestImp;
+class OCLTestImp;
+// Abstract interface for a single ocltst test: the pure-virtual members below
+// must be supplied by the implementing class.
+class OCLTest {
+ public:
+  virtual unsigned int getThreadUsage(void) = 0;
+  virtual int getNumSubTests(void) = 0;
+  virtual void open() = 0;
+  virtual void open(unsigned int test, const char* deviceName,
+                    unsigned int architecture) = 0;
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId, unsigned int platformIndex) = 0;
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId) = 0;
+
+  virtual void run(void) = 0;
+  virtual unsigned int close(void) = 0;
+  virtual void setErrorMsg(const char* error) = 0;
+  virtual const char* getErrorMsg(void) = 0;
+  virtual bool hasErrorOccured(void) = 0;
+  virtual void clearError() = 0;
+  virtual void setDeviceId(unsigned int deviceId) = 0;
+  virtual void setPlatformIndex(unsigned int platformIndex) = 0;
+  virtual OCLTestImp* toOCLTestImp() = 0;
+  virtual BaseTestImp* toBaseTestImp() = 0;
+  virtual float getPerfInfo() = 0;
+  virtual void clearPerfInfo(void) = 0;
+
+  virtual void setIterationCount(int cnt) = 0;
+  virtual void useCPU() = 0;
+  // Returning true lets the created test be cached between runs and deleted
+  // only after all tests have finished. Defaults to false because few tests
+  // have been updated to support caching; override to opt in.
+  // FIXME: Switch all tests to support caching.
+  virtual bool cache_test() { return false; }
+
+  std::string testDescString;
+  void resetDescString(void) { testDescString.clear(); }
+
+  virtual ~OCLTest() {}
+};
+
+#endif  // _OCLTEST_H_
diff --git a/projects/clr/opencl/tests/ocltst/include/OCLTestList.h b/projects/clr/opencl/tests/ocltst/include/OCLTestList.h
new file mode 100644
index 0000000000..ad39837623
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/include/OCLTestList.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLMODULE_H_
+#define _OCLMODULE_H_
+
+// Calling convention of exported module entry points; empty off Windows.
+#ifdef ATI_OS_WIN
+#define OCLLCONV __cdecl
+#else
+#define OCLLCONV
+#endif
+
+class OCLTest;
+
+// Function pointer typedefs for the exported module functions; each uses
+// the OCLLCONV calling convention. The unsigned int arguments are presumably
+// test indices -- TODO confirm against the code that loads the module.
+typedef unsigned int(OCLLCONV *TestCountFuncPtr)(void);
+typedef const char *(OCLLCONV *TestNameFuncPtr)(unsigned int);
+typedef OCLTest *(OCLLCONV *CreateTestFuncPtr)(unsigned int);
+typedef void(OCLLCONV *DestroyTestFuncPtr)(OCLTest *);
+typedef unsigned int(OCLLCONV *TestVersionFuncPtr)(void);
+typedef const char *(OCLLCONV *TestLibNameFuncPtr)(void);
+
+#endif  // _OCLMODULE_H_
diff --git a/projects/clr/opencl/tests/ocltst/include/OCLTestUtils.h b/projects/clr/opencl/tests/ocltst/include/OCLTestUtils.h
new file mode 100644
index 0000000000..ea1565afc2
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/include/OCLTestUtils.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef OCLTESTUTILS_H_
+#define OCLTESTUTILS_H_
+#include <string>
+
+// @brief Load the contents of a file into a string
+// @param FN Name of the file to be loaded
+// @param S  String that receives the loaded file
+// @return true on success (presumably false on any failure)
+bool loadFile(const char* FN, std::string& S);
+
+#endif /* OCLTESTUTILS_H_ */
diff --git a/projects/clr/opencl/tests/ocltst/include/OCLWrapper.h b/projects/clr/opencl/tests/ocltst/include/OCLWrapper.h
new file mode 100644
index 0000000000..757dd84000
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/include/OCLWrapper.h
@@ -0,0 +1,614 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef __OCLWrapper_H
+#define __OCLWrapper_H
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_gl.h"
+#include "cl_profile_amd.h"
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clUnloadPlatformAMD_fn)(
+    cl_platform_id id);  // same signature as OCLWrapper::clUnloadPlatformAMD
+
+// Function pointer types for the cl_khr_gl_sharing entry points (missing in
+// cl_gl.h). GL object names and targets are passed as plain unsigned int.
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties *properties, cl_gl_context_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret);
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLBuffer_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int bufobj,
+    int *errcode_ret);  // NOTE(review): plain int*, siblings use cl_int*
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int texture_target,
+    int miplevel, unsigned int texture, cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture2D_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int texture_target,
+    int miplevel, unsigned int texture, cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLRenderbuffer_fn)(
+    cl_context context, cl_mem_flags flags, unsigned int renderbuffer,
+    cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLObjectInfo_fn)(
+    cl_mem memobj, cl_gl_object_type *gl_object_type,
+    unsigned int *gl_object_name);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLTextureInfo_fn)(
+    cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
+    void *param_value, size_t *param_value_size_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueAcquireGLObjects_fn)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueReleaseGLObjects_fn)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+
+// Function pointer types for AMD perf-counter and clock-mode entry points
+typedef CL_API_ENTRY cl_perfcounter_amd(CL_API_CALL *clCreatePerfCounterAMD_fn)(
+    cl_device_id device, cl_perfcounter_property *properties,
+    cl_int *errcode_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueBeginPerfCounterAMD_fn)(
+    cl_command_queue command_queue, cl_uint num_perf_counters,
+    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueEndPerfCounterAMD_fn)(
+    cl_command_queue command_queue, cl_uint num_perf_counters,
+    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetPerfCounterInfoAMD_fn)(
+    cl_perfcounter_amd perf_counter, cl_perfcounter_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clReleasePerfCounterAMD_fn)(
+    cl_perfcounter_amd perf_counter);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clRetainPerfCounterAMD_fn)(
+    cl_perfcounter_amd perf_counter);
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL *clSetDeviceClockModeAMD_fn)(
+    cl_device_id device,
+    cl_set_device_clock_mode_input_amd set_clock_mode_input,
+    cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
+
+class OCLWrapper {
+ public:
+  OCLWrapper();
+
+  ~OCLWrapper() {}
+
+  // All OCL APIs are declared in the order they appear in cl.h
+
+  cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
+                          cl_uint *num_platforms);
+
+  cl_int clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
+                           size_t param_value_size, void *param_value,
+                           size_t *param_value_size_ret);
+
+  cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
+                        cl_uint num_entries, cl_device_id *devices,
+                        cl_uint *num_devices);
+
+  cl_int clGetDeviceInfo(cl_device_id device, cl_device_info param_name,
+                         size_t param_value_size, void *param_value,
+                         size_t *param_value_size_ret);
+
+  cl_context clCreateContext(cl_context_properties *properties,
+                             cl_uint num_devices, const cl_device_id *devices,
+                             void(CL_CALLBACK *pfn_notify)(const char *,
+                                                           const void *, size_t,
+                                                           void *),
+                             void *user_data, cl_int *errcode_ret);
+
+  cl_context clCreateContextFromType(
+      cl_context_properties *properties, cl_device_type device_type,
+      void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+      void *user_data, cl_int *errcode_ret);
+
+  cl_int clRetainContext(cl_context context);
+
+  cl_int clReleaseContext(cl_context context);
+
+  cl_int clGetContextInfo(cl_context context, cl_context_info param_name,
+                          size_t param_value_size, void *param_value,
+                          size_t *param_value_size_ret);
+
+  cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device,
+                                        cl_command_queue_properties properties,
+                                        cl_int *errcode_ret);
+
+  cl_int clRetainCommandQueue(cl_command_queue command_queue);
+
+  cl_int clReleaseCommandQueue(cl_command_queue command_queue);
+
+  cl_int clGetCommandQueueInfo(cl_command_queue command_queue,
+                               cl_command_queue_info param_name,
+                               size_t param_value_size, void *param_value,
+                               size_t *param_value_size_ret);
+
+  cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
+                        void *host_ptr, cl_int *errcode_ret);
+
+  cl_mem clCreateImage2D(cl_context context, cl_mem_flags flags,
+                         const cl_image_format *image_format,
+                         size_t image_width, size_t image_height,
+                         size_t image_row_pitch, void *host_ptr,
+                         cl_int *errcode_ret);
+
+  cl_mem clCreateImage3D(cl_context context, cl_mem_flags flags,
+                         const cl_image_format *image_format,
+                         size_t image_width, size_t image_height,
+                         size_t image_depth, size_t image_row_pitch,
+                         size_t image_slice_pitch, void *host_ptr,
+                         cl_int *errcode_ret);
+
+  cl_int clRetainMemObject(cl_mem memobj);
+
+  cl_int clReleaseMemObject(cl_mem memobj);
+
+  cl_int clGetSupportedImageFormats(cl_context context, cl_mem_flags flags,
+                                    cl_mem_object_type image_type,
+                                    cl_uint num_entries,
+                                    cl_image_format *image_formats,
+                                    cl_uint *num_image_formats);
+
+  cl_int clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name,
+                            size_t param_value_size, void *param_value,
+                            size_t *param_value_size_ret);
+
+  cl_int clGetImageInfo(cl_mem image, cl_image_info param_name,
+                        size_t param_value_size, void *param_value,
+                        size_t *param_value_size_ret);
+
+  cl_sampler clCreateSampler(cl_context context, cl_bool normalized_coords,
+                             cl_addressing_mode addressing_mode,
+                             cl_filter_mode filter_mode, cl_int *errcode_ret);
+
+  cl_int clRetainSampler(cl_sampler sampler);
+
+  cl_int clReleaseSampler(cl_sampler sampler);
+
+  cl_int clGetSamplerInfo(cl_sampler sampler, cl_sampler_info param_name,
+                          size_t param_value_size, void *param_value,
+                          size_t *param_value_size_ret);
+
+  cl_program clCreateProgramWithSource(cl_context context, cl_uint count,
+                                       const char **strings,
+                                       const size_t *lengths,
+                                       cl_int *errcode_ret);
+
+  cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices,
+                                       const cl_device_id *device_list,
+                                       const size_t *lengths,
+                                       const unsigned char **binaries,
+                                       cl_int *binary_status,
+                                       cl_int *errcode_ret);
+
+  cl_int clRetainProgram(cl_program program);
+
+  cl_int clReleaseProgram(cl_program program);
+
+  cl_int clBuildProgram(cl_program program, cl_uint num_devices,
+                        const cl_device_id *device_list, const char *options,
+                        void(CL_CALLBACK *pfn_notify)(cl_program program,
+                                                      void *user_data),
+                        void *user_data);
+
+  cl_int clCompileProgram(
+      cl_program program, cl_uint num_devices, const cl_device_id *device_list,
+      const char *options, cl_uint num_input_headers,
+      const cl_program *input_headers, const char **header_include_names,
+      void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+      void *user_data);
+
+  cl_program clLinkProgram(cl_context context, cl_uint num_devices,
+                           const cl_device_id *device_list, const char *options,
+                           cl_uint num_input_programs,
+                           const cl_program *input_programs,
+                           void(CL_CALLBACK *pfn_notify)(cl_program program,
+                                                         void *user_data),
+                           void *user_data, cl_int *errcode_ret);
+
+  cl_int clUnloadCompiler(void);
+
+  cl_int clUnloadPlatform(cl_platform_id);
+
+  cl_int clGetProgramInfo(cl_program program, cl_program_info param_name,
+                          size_t param_value_size, void *param_value,
+                          size_t *param_value_size_ret);
+
+  cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
+                               cl_program_build_info param_name,
+                               size_t param_value_size, void *param_value,
+                               size_t *param_value_size_ret);
+
+  cl_kernel clCreateKernel(cl_program program, const char *kernel_name,
+                           cl_int *errcode_ret);
+
+  cl_int clCreateKernelsInProgram(cl_program program, cl_uint num_kernels,
+                                  cl_kernel *kernels, cl_uint *num_kernels_ret);
+
+  cl_int clRetainKernel(cl_kernel kernel);
+
+  cl_int clReleaseKernel(cl_kernel kernel);
+
+  cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size,
+                        const void *arg_value);
+
+  cl_int clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name,
+                         size_t param_value_size, void *param_value,
+                         size_t *param_value_size_ret);
+
+  cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device,
+                                  cl_kernel_work_group_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret);
+
+  cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list);
+
+  cl_int clGetEventInfo(cl_event evnt, cl_event_info param_name,
+                        size_t param_value_size, void *param_value,
+                        size_t *param_value_size_ret);
+
+  cl_int clRetainEvent(cl_event evnt);
+
+  cl_int clReleaseEvent(cl_event evnt);
+
+  cl_int clGetEventProfilingInfo(cl_event evnt, cl_profiling_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret);
+
+  cl_int clFlush(cl_command_queue command_queue);
+
+  cl_int clFinish(cl_command_queue command_queue);
+
+  cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
+                             cl_bool blocking_read, size_t offset, size_t cb,
+                             void *ptr, cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
+                              cl_bool blocking_write, size_t offset, size_t cb,
+                              const void *ptr, cl_uint num_events_in_wait_list,
+                              const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer,
+                             cl_mem dst_buffer, size_t src_offset,
+                             size_t dst_offset, size_t cb,
+                             cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueReadImage(cl_command_queue command_queue, cl_mem image,
+                            cl_bool blocking_read, const size_t *origin,
+                            const size_t *region, size_t row_pitch,
+                            size_t slice_pitch, void *ptr,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image,
+                             cl_bool blocking_write, const size_t *origin,
+                             const size_t *region, size_t input_row_pitch,
+                             size_t input_slice_pitch, const void *ptr,
+                             cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image,
+                            cl_mem dst_image, const size_t *src_origin,
+                            const size_t *dst_origin, const size_t *region,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+                                    cl_mem src_image, cl_mem dst_buffer,
+                                    const size_t *src_origin,
+                                    const size_t *region, size_t dst_offset,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *evnt);
+
+  cl_int clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+                                    cl_mem src_buffer, cl_mem dst_image,
+                                    size_t src_offset, const size_t *dst_origin,
+                                    const size_t *region,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *evnt);
+
+  void *clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer,
+                           cl_bool blocking_map, cl_map_flags map_flags,
+                           size_t offset, size_t cb,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list, cl_event *evnt,
+                           cl_int *errcode_ret);
+
+  void *clEnqueueMapImage(cl_command_queue command_queue, cl_mem image,
+                          cl_bool blocking_map, cl_map_flags map_flags,
+                          const size_t *origin, const size_t *region,
+                          size_t *image_row_pitch, size_t *image_slice_pitch,
+                          cl_uint num_events_in_wait_list,
+                          const cl_event *event_wait_list, cl_event *evnt,
+                          cl_int *errcode_ret);
+
+  cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_mem memobj,
+                                 void *mapped_ptr,
+                                 cl_uint num_events_in_wait_list,
+                                 const cl_event *event_wait_list,
+                                 cl_event *evnt);
+
+  cl_int clEnqueueNDRangeKernel(
+      cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
+      const size_t *global_work_offset, const size_t *global_work_size,
+      const size_t *local_work_size, cl_uint num_events_in_wait_list,
+      const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel,
+                       cl_uint num_events_in_wait_list,
+                       const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueNativeKernel(cl_command_queue command_queue,
+                               void(CL_CALLBACK *user_func)(void *), void *args,
+                               size_t cb_args, cl_uint num_mem_objects,
+                               const cl_mem *mem_list,
+                               const void **args_mem_loc,
+                               cl_uint num_events_in_wait_list,
+                               const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *evnt);
+
+  cl_int clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+                                     cl_uint num_events_in_wait_list,
+                                     const cl_event *event_wait_list,
+                                     cl_event *evnt);
+
+  cl_int clEnqueueWaitForEvents(cl_command_queue command_queue,
+                                cl_uint num_events, const cl_event *event_list);
+
+  cl_int clEnqueueBarrier(cl_command_queue command_queue);
+
+  void *clGetExtensionFunctionAddress(const char *func_name);
+
+  cl_int clEnqueueReadBufferRect(
+      cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+      const size_t *buffer_origin, const size_t *host_origin,
+      const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+      size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
+      cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+      cl_event *evnt);
+
+  cl_int clEnqueueWriteBufferRect(
+      cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+      const size_t *buffer_origin, const size_t *host_origin,
+      const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+      size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
+      cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+      cl_event *evnt);
+
+  cl_int clEnqueueCopyBufferRect(
+      cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
+      const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+      size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
+      size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
+      const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_mem clCreateImage(cl_context context, cl_mem_flags flags,
+                       const cl_image_format *image_format,
+                       const cl_image_desc *image_desc, void *host_ptr,
+                       cl_int *errcode_ret);
+
+  cl_mem clCreateSubBuffer(cl_mem mem, cl_mem_flags flags,
+                           cl_buffer_create_type buffer_create_type,
+                           const void *buffer_create_info, cl_int *errcode_ret);
+
+  cl_int clSetEventCallback(
+      cl_event event, cl_int command_exec_callback_type,
+      void(CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                          cl_int event_command_exec_status,
+                                          void *user_data),
+      void *user_data);
+
+  cl_int clEnqueueFillImage(cl_command_queue command_queue, cl_mem image,
+                            void *ptr, const size_t *origin,
+                            const size_t *region,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list, cl_event *evnt);
+
+  cl_int clUnloadPlatformAMD(cl_platform_id id);
+
+  cl_int clEnqueueWaitSignalAMD(cl_command_queue command_queue,
+                                cl_mem mem_object, cl_uint value,
+                                cl_uint num_events,
+                                const cl_event *event_wait_list,
+                                cl_event *event);
+
+  cl_int clEnqueueWriteSignalAMD(cl_command_queue command_queue,
+                                 cl_mem mem_object, cl_uint value,
+                                 cl_ulong offset, cl_uint num_events,
+                                 const cl_event *event_list, cl_event *event);
+
+  cl_int clEnqueueMakeBuffersResidentAMD(
+      cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects,
+      cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses,
+      cl_uint num_events, const cl_event *event_list, cl_event *event);
+
+  cl_int clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                                    cl_uint num_mem_objects,
+                                    const cl_mem *mem_objects,
+                                    cl_mem_migration_flags flags,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event);
+
+  // CL-GL Extension: cl_khr_gl_sharing
+  cl_int clGetGLContextInfoKHR(const cl_context_properties *properties,
+                               cl_gl_context_info param_name,
+                               size_t param_value_size, void *param_value,
+                               size_t *param_value_size_ret);
+
+  cl_mem clCreateFromGLBuffer(cl_context context, cl_mem_flags flags,
+                              unsigned int bufobj, int *errcode_ret);
+
+  cl_mem clCreateFromGLTexture(cl_context context, cl_mem_flags flags,
+                               unsigned int texture_target, int miplevel,
+                               unsigned int texture, cl_int *errcode_ret);
+
+  cl_mem clCreateFromGLTexture2D(cl_context context, cl_mem_flags flags,
+                                 unsigned int texture_target, int miplevel,
+                                 unsigned int texture, cl_int *errcode_ret);
+
+  cl_mem clCreateFromGLRenderbuffer(cl_context context, cl_mem_flags flags,
+                                    unsigned int renderbuffer,
+                                    cl_int *errcode_ret);
+
+  cl_int clGetGLObjectInfo(cl_mem memobj, cl_gl_object_type *gl_object_type,
+                           unsigned int *gl_object_name);
+
+  cl_int clGetGLTextureInfo(cl_mem memobj, cl_gl_texture_info param_name,
+                            size_t param_value_size, void *param_value,
+                            size_t *param_value_size_ret);
+
+  cl_int clEnqueueAcquireGLObjects(cl_command_queue command_queue,
+                                   cl_uint num_objects,
+                                   const cl_mem *mem_objects,
+                                   cl_uint num_events_in_wait_list,
+                                   const cl_event *event_wait_list,
+                                   cl_event *event);
+
+  cl_int clEnqueueReleaseGLObjects(cl_command_queue command_queue,
+                                   cl_uint num_objects,
+                                   const cl_mem *mem_objects,
+                                   cl_uint num_events_in_wait_list,
+                                   const cl_event *event_wait_list,
+                                   cl_event *event);
+
+#if defined(CL_VERSION_2_0)
+  cl_command_queue clCreateCommandQueueWithProperties(
+      cl_context context, cl_device_id device,
+      const cl_queue_properties *properties, cl_int *errcode_ret);
+
+  void *clSVMAlloc(cl_context context, cl_svm_mem_flags flags, size_t size,
+                   cl_uint alignment);
+
+  void clSVMFree(cl_context context, void *svm_pointer);
+
+  cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map,
+                         cl_map_flags flags, void *svm_ptr, size_t size,
+                         cl_uint num_events_in_wait_list,
+                         const cl_event *event_wait_list, cl_event *event);
+
+  cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list, cl_event *event);
+
+  cl_int clEnqueueSVMMemFill(cl_command_queue command_queue, void *svm_ptr,
+                             const void *pattern, size_t pattern_size,
+                             size_t size, cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list, cl_event *event);
+
+  cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index,
+                                  const void *arg_value);
+
+  cl_mem clCreatePipe(cl_context context, cl_mem_flags flags,
+                      cl_uint packet_size, cl_uint num_packets,
+                      const cl_pipe_properties *properties,
+                      cl_int *errcode_ret);
+
+  cl_int clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name,
+                       size_t param_value_size, void *param_value,
+                       size_t *param_value_size_ret);
+
+#endif
+
+  cl_perfcounter_amd clCreatePerfCounterAMD(cl_device_id device,
+                                            cl_perfcounter_property *properties,
+                                            cl_int *errcode_ret);
+
+  cl_int clEnqueueBeginPerfCounterAMD(cl_command_queue command_queue,
+                                      cl_uint num_perf_counters,
+                                      cl_perfcounter_amd *perf_counters,
+                                      cl_uint num_events_in_wait_list,
+                                      const cl_event *event_wait_list,
+                                      cl_event *event);
+
+  cl_int clEnqueueEndPerfCounterAMD(cl_command_queue command_queue,
+                                    cl_uint num_perf_counters,
+                                    cl_perfcounter_amd *perf_counters,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event);
+
+  cl_int clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter,
+                                 cl_perfcounter_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret);
+
+  cl_int clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter);
+
+  cl_int clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter);
+
+  cl_int clSetDeviceClockModeAMD(
+      cl_device_id device,
+      cl_set_device_clock_mode_input_amd set_clock_mode_input,
+      cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
+
+ private:
+  clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD_ptr;
+  clEnqueueWriteSignalAMD_fn clEnqueueWriteSignalAMD_ptr;
+  clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD_ptr;
+
+  // Unload the platform
+  clUnloadPlatformAMD_fn clUnloadPlatformAMD_ptr;
+
+  // CL-GL Extension: cl_khr_gl_sharing
+  clGetGLContextInfoKHR_fn clGetGLContextInfoKHR_ptr;
+  clCreateFromGLBuffer_fn clCreateFromGLBuffer_ptr;
+  clCreateFromGLTexture_fn clCreateFromGLTexture_ptr;
+  clCreateFromGLTexture2D_fn clCreateFromGLTexture2D_ptr;
+  clCreateFromGLRenderbuffer_fn clCreateFromGLRenderbuffer_ptr;
+  clGetGLObjectInfo_fn clGetGLObjectInfo_ptr;
+  clGetGLTextureInfo_fn clGetGLTextureInfo_ptr;
+  clEnqueueAcquireGLObjects_fn clEnqueueAcquireGLObjects_ptr;
+  clEnqueueReleaseGLObjects_fn clEnqueueReleaseGLObjects_ptr;
+
+  // Performance counters
+  clCreatePerfCounterAMD_fn clCreatePerfCounterAMD_ptr;
+  clEnqueueBeginPerfCounterAMD_fn clEnqueueBeginPerfCounterAMD_ptr;
+  clEnqueueEndPerfCounterAMD_fn clEnqueueEndPerfCounterAMD_ptr;
+  clGetPerfCounterInfoAMD_fn clGetPerfCounterInfoAMD_ptr;
+  clReleasePerfCounterAMD_fn clReleasePerfCounterAMD_ptr;
+  clRetainPerfCounterAMD_fn clRetainPerfCounterAMD_ptr;
+  // Set clockMode
+  clSetDeviceClockModeAMD_fn clSetDeviceClockModeAMD_ptr;
+};
+
+#endif
diff --git a/projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp b/projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp
new file mode 100644
index 0000000000..519833fd98
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp
@@ -0,0 +1,104 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "oclTestLog.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "OCLLog.h"
+
+// Starts with console-only logging: messages go to stdout and no log file is
+// configured until enable_write_to_file() is called.
+oclLog::oclLog()
+    : m_stdout_fp(stdout),
+      m_filename(),
+      m_writeToFileIsEnabled(false) {}
+
+// Switches file logging off on destruction.
+oclLog::~oclLog() {
+  disable_write_to_file();
+}
+
+// Enables mirroring of log output into `filename`.
+//
+// The file is created (or truncated) once here and closed again; vprint()
+// reopens it in append mode for every message. File logging is switched on
+// only after the file has been opened successfully: the failure message below
+// is emitted through oclTestLog(), which ends up in vprint(), so the flag must
+// be off while it is printed or the logger would try to append to the very
+// file it just failed to open and recurse.
+void oclLog::enable_write_to_file(std::string filename) {
+  m_writeToFileIsEnabled = false;
+  m_filename = filename;
+
+  FILE* fp = fopen(m_filename.c_str(), "w");
+  if (fp == NULL) {
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "ERROR: Cannot open file %s. Disabling logging to file.\n",
+               m_filename.c_str());
+    return;
+  }
+  fclose(fp);
+  m_writeToFileIsEnabled = true;
+}
+
+// Stops mirroring output into the log file; the stored file name is kept.
+void oclLog::disable_write_to_file() {
+  m_writeToFileIsEnabled = false;
+}
+
+// Formats one message and writes it to stdout and, when file logging is
+// enabled, appends it to the log file.
+//
+// The message is formatted into a fixed-size stack buffer, so anything longer
+// than the buffer is truncated. The log file is opened and closed again for
+// every message.
+void oclLog::vprint(char const* fmt, va_list args) {
+  char buffer[4096];
+
+  // The buffer is zeroed and one byte is held back so the text is always
+  // NUL-terminated, even with a vsnprintf that does not terminate on
+  // truncation.
+  memset(buffer, 0, sizeof(buffer));
+  int rc = vsnprintf(buffer, sizeof(buffer) - 1, fmt, args);
+  if (rc < 0) {
+    // Formatting failed; there is nothing meaningful to print.
+    return;
+  }
+
+  fputs(buffer, m_stdout_fp);
+  if (!m_writeToFileIsEnabled) {
+    return;
+  }
+
+  FILE* fp = fopen(m_filename.c_str(), "a");
+  if (fp == NULL) {
+    // Turn file logging off *before* reporting the problem: the report comes
+    // back through vprint() and must not try to open the file again.
+    m_writeToFileIsEnabled = false;
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "ERROR: Cannot open file %s. Disabling logging to file.\n",
+               m_filename.c_str());
+    return;
+  }
+  fputs(buffer, fp);
+  fclose(fp);
+}
+
+// Flushes the console stream this logger writes to.
+void oclLog::flush() {
+  fflush(m_stdout_fp);
+}
+
+// Returns the logger instance shared by the oclTest* functions in this file.
+// The instance is a function-local static, so it is constructed on first use.
+static oclLog& theLog() {
+  static oclLog sharedLog;
+  return sharedLog;
+}
+
+// Messages whose level is numerically greater than this are dropped.
+static oclLoggingLevel currentLevel = OCLTEST_LOG_ALWAYS;
+// Number of oclTestLog() calls so far, including the filtered ones.
+static float logcount = 0.0f;
+
+// printf-style logging entry point.
+//
+// The message is emitted (and the console flushed) only when `logLevel` does
+// not exceed the level set through oclTestSetLogLevel().
+void oclTestLog(oclLoggingLevel logLevel, const char* fmt, ...) {
+  logcount += 1.0f;
+
+  if (logLevel > currentLevel) {
+    return;
+  }
+
+  va_list ap;
+  va_start(ap, fmt);
+
+  oclLog& log = theLog();
+  log.vprint(fmt, ap);
+  log.flush();
+
+  va_end(ap);
+}
+
+// Mirrors all subsequent log output into `filename` in addition to stdout.
+//
+// A NULL name is rejected with an error message: converting NULL to the
+// std::string parameter of enable_write_to_file() is undefined behaviour.
+void oclTestEnableLogToFile(const char* filename) {
+  if (filename == NULL) {
+    oclTestLog(OCLTEST_LOG_ALWAYS,
+               "ERROR: No log file name given. Logging to file not enabled.\n");
+    return;
+  }
+  theLog().enable_write_to_file(filename);
+}
+
+// Sets the maximum level that oclTestLog() will print. Negative values are
+// ignored and leave the current level unchanged.
+void oclTestSetLogLevel(int level) {
+  if (level < 0) {
+    return;
+  }
+  currentLevel = static_cast<oclLoggingLevel>(level);
+}
diff --git a/projects/clr/opencl/tests/ocltst/log/oclTestLog.h b/projects/clr/opencl/tests/ocltst/log/oclTestLog.h
new file mode 100644
index 0000000000..28953941ce
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/log/oclTestLog.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef CALTESTLOG_H_
+#define CALTESTLOG_H_
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include <string>
+
+// Logger that prints formatted messages to a console stream and can
+// additionally mirror them into a named file.
+class oclLog {
+ public:
+  oclLog();
+  virtual ~oclLog();
+  // Formats `fmt` with the already-started argument list `args` and emits the
+  // resulting text.
+  virtual void vprint(char const* fmt, va_list args);
+  // Flushes the console stream.
+  virtual void flush();
+  // Starts mirroring output into the file called `filename`.
+  virtual void enable_write_to_file(std::string filename);
+  // Stops mirroring output into the file.
+  virtual void disable_write_to_file();
+
+ private:
+  // Console stream that every message is written to.
+  FILE* m_stdout_fp;
+  // Name of the log file used while file logging is enabled.
+  std::string m_filename;
+  // True while messages are also written to m_filename.
+  bool m_writeToFileIsEnabled;
+};
+
+#endif  // CALTESTLOG_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp b/projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp
new file mode 100644
index 0000000000..5aa6ce2b34
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp
@@ -0,0 +1,185 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "BaseTestImp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstring>
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Computes one entry of the CRC lookup table; defined at the end of this file.
+static unsigned int crcinit(unsigned int crc);
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Builds the CRC lookup table and puts the per-test state into its defaults.
+BaseTestImp::BaseTestImp()
+    : _numSubTests(0), _openTest(0), _deviceName(NULL), _architecture(0) {
+  _cpu = false;
+
+  // One table entry per byte value; the byte is placed in the top 8 bits.
+  for (unsigned int byteValue = 0; byteValue < 256; ++byteValue) {
+    _crctab[byteValue] = crcinit(byteValue << 24);
+  }
+  _crcword = ~0;
+
+  _deviceId = 0;
+  _platformIndex = 0;
+  _perfInfo = 0.0f;
+
+#ifdef ATI_OS_LINUX
+  _useThreads = 0;  // disable threads on linux
+#else
+  _useThreads = 1;  // if available on platform
+#endif
+
+  clearError();
+}
+
+// Opens the test and marks it as failed when the selected GPU device reports
+// a driver version string containing "LC".
+//
+// The platform at _platformIndex is selected, its GPU devices are enumerated
+// into devices_ / deviceCount_, and CL_DRIVER_VERSION of the device at
+// _deviceId is inspected. The three parameters are not used by this function.
+//
+// NOTE(review): CHECK_RESULT is defined outside this file; it is assumed to
+// record the failure and leave the function when its condition is true.
+void BaseTestImp::checkComplib(unsigned int test, const char *deviceName,
+                               unsigned int architecture) {
+  BaseTestImp::open();
+  devices_ = 0;
+  deviceCount_ = 0;
+  context_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  type_ = CL_DEVICE_TYPE_GPU;
+
+  cl_uint numPlatforms = 0;
+  error_ = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
+  CHECK_RESULT((numPlatforms == 0), "No platform found");
+
+  // Pick the platform and free the temporary array before any further
+  // CHECK_RESULT so that an early exit cannot leak it. An out-of-range
+  // _platformIndex leaves `platform` at 0 and is reported below.
+  cl_platform_id platform = 0;
+  cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+  error_ = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+    platform = platforms[_platformIndex];
+  }
+  delete[] platforms;
+
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  CHECK_RESULT((platform == 0), "AMD Platform not found");
+
+  error_ = clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+  // devices_[_deviceId] is read below, so the index must be valid.
+  CHECK_RESULT((_deviceId >= deviceCount_), "Device index out of range");
+
+  devices_ = new cl_device_id[deviceCount_];
+  error_ = clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  // Zero-initialized so that strstr() sees an empty string if the query fails.
+  char device_string[200] = {0};
+  clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, sizeof(device_string),
+                  device_string, NULL);
+  if (strstr(device_string, "LC")) {
+    printf("Skipping test since it does not run with LC\n");
+    failed_ = true;
+  }
+}
+
+// Nothing to release in the base implementation.
+BaseTestImp::~BaseTestImp() {}
+
+// Resets the running CRC word and clears any recorded error.
+void BaseTestImp::open() {
+  _crcword = 0;
+  clearError();
+}
+
+// Overload kept for interface compatibility: the arguments are ignored and
+// the call is forwarded to the parameterless open().
+void BaseTestImp::open(unsigned int test, const char *deviceName,
+                       unsigned int architecture) {
+  open();
+}
+
+// Returns the current CRC word as the result of the test.
+unsigned int BaseTestImp::close() {
+  return _crcword;
+}
+
+// Returns the thread-usage setting chosen in the constructor.
+unsigned int BaseTestImp::getThreadUsage(void) {
+  return _useThreads;
+}
+
+// Returns the number of sub-tests.
+int BaseTestImp::getNumSubTests(void) {
+  return _numSubTests;
+}
+
+// Stores the pointer to the device name; the string itself is not copied.
+void BaseTestImp::setDeviceName(const char *name) {
+  _deviceName = name;
+}
+
+// Returns the pointer stored by setDeviceName().
+const char *BaseTestImp::getDeviceName() {
+  return _deviceName;
+}
+
+// Returns the stored performance figure.
+float BaseTestImp::getPerfInfo(void) {
+  return _perfInfo;
+}
+
+// Resets the stored performance figure to zero.
+void BaseTestImp::clearPerfInfo(void) {
+  _perfInfo = 0.0;
+}
+
+// Selects the device index.
+void BaseTestImp::setDeviceId(unsigned int deviceId) {
+  _deviceId = deviceId;
+}
+
+// Stores the iteration count.
+void BaseTestImp::setIterationCount(int cnt) {
+  _iterationCnt = cnt;
+}
+
+// Returns the selected device index.
+unsigned int BaseTestImp::getDeviceId() {
+  return _deviceId;
+}
+
+// Selects the platform index.
+void BaseTestImp::setPlatformIndex(unsigned int platformIndex) {
+  _platformIndex = platformIndex;
+}
+
+// Returns the selected platform index.
+unsigned int BaseTestImp::getPlatformIndex() {
+  return _platformIndex;
+}
+
+void BaseTestImp::setErrorMsg(const char *error) {
+  _errorFlag = true;
+  _errorMsg.assign((const char *)error);
+}
+
+const char *BaseTestImp::getErrorMsg() { return _errorMsg.c_str(); }
+
+bool BaseTestImp::hasErrorOccured() { return _errorFlag; }
+
+void BaseTestImp::clearError() {
+  _errorFlag = false;
+  _errorMsg.clear();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Same CRC32 as used by ogtst
+//
+// Polynomial XORed into the register whenever a set bit is shifted out.
+static const unsigned int CRCMASK = 0x04c11db7;
+
+// Runs eight shift steps on `crc`, most significant bit first, and returns the
+// resulting register value. The constructor calls this with each byte value
+// placed in the top 8 bits to fill the CRC lookup table.
+static unsigned int crcinit(unsigned int crc) {
+  unsigned int reg = crc;
+
+  for (int step = 0; step < 8; ++step) {
+    const unsigned int topBit = reg & 0x80000000;
+    reg <<= 1;
+    if (topBit) {
+      reg ^= CRCMASK;
+    }
+  }
+  return reg;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp
new file mode 100644
index 0000000000..4cf7aa3289
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp
@@ -0,0 +1,175 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLCommon.h"
+
+#include <cmath>
+#include <cstring>
+
+// Opens the test for CL/GL interop.
+//
+// Runs the base-class open, verifies that the selected device advertises
+// cl_khr_gl_sharing, creates the OpenGL context and finally replaces the CL
+// context with one created from that GL context. On any failure the error
+// flag is raised (directly or through CHECK_RESULT) and the function returns.
+void OCLGLCommon::open(unsigned int test, char *units, double &conversion,
+                       unsigned int deviceId) {
+  // OpenCL Initialization
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_);
+
+  if (deviceId >= deviceCount_) {
+    _errorFlag = true;
+    return;
+  }
+
+  // Check that the device supports CL/GL interop extension. The extension
+  // list has no fixed maximum length, so its size is queried first; a
+  // too-small fixed buffer would make the query fail and the extension look
+  // absent.
+  bool hasGLSharing = false;
+  size_t size = 0;
+  cl_int status = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_EXTENSIONS, 0, NULL, &size);
+  if (status == CL_SUCCESS && size > 0) {
+    char *extensions = new char[size + 1];
+    memset(extensions, 0, size + 1);
+    status = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                                       size, extensions, NULL);
+    hasGLSharing = (status == CL_SUCCESS) &&
+                   (strstr(extensions, "cl_khr_gl_sharing") != NULL);
+    delete[] extensions;
+  }
+  if (!hasGLSharing) {
+    printf("KHR GL sharing extension is required for this test!\n");
+    _errorFlag = true;
+    return;
+  }
+
+  // OpenGL Initialization
+  bool retVal = initializeGLContext(hGL_);
+  CHECK_RESULT((!retVal), "Error initializing OpenGL context");
+
+  createCLContextFromGLContext(hGL_);
+}
+
+// Probes whether an OpenGL context usable with the selected device can be
+// created. The test is opened through the base class, a GL context is
+// created and immediately deleted again, and the test is closed.
+// Returns true when the GL context could be initialized.
+bool OCLGLCommon::IsGLEnabled(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+
+  const bool glAvailable = initializeGLContext(hGL_);
+  if (glAvailable) {
+    deleteGLContext(hGL_);
+  }
+
+  OCLTestImp::close();
+  return glAvailable;
+}
+
+// Sets up a perspective projection through glFrustum().
+//
+// fovy   - vertical field of view in degrees
+// aspect - width / height ratio of the view
+// zNear  - distance to the near clipping plane
+// zFar   - distance to the far clipping plane
+void OCLGLCommon::gluPerspective(double fovy, double aspect, double zNear,
+                                 double zFar) {
+  static const double kPi = 3.14159265358979323846;
+
+  // Half of the field of view, converted from degrees to radians:
+  // (fovy / 2) * (pi / 180) == fovy * pi / 360.
+  const double ymax = zNear * tan(fovy * kPi / 360.0);
+  const double ymin = -ymax;
+  const double xmin = ymin * aspect;
+  const double xmax = ymax * aspect;
+  glFrustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+// Closes the test: the GL context is made current, the base class close runs,
+// and only then is the GL context deleted. Returns the base class result.
+unsigned int OCLGLCommon::close(void) {
+  makeCurrent(hGL_);
+
+  const unsigned int result = OCLTestImp::close();
+
+  deleteGLContext(hGL_);
+  return result;
+}
+
+// Writes a dimSize x dimSize float matrix to the text file `fileName`, one
+// matrix row per line, each value followed by ",\t". Does nothing when
+// `pBuffer` is NULL or the file cannot be opened.
+void OCLGLCommon::dumpBuffer(float *pBuffer, const char fileName[],
+                             unsigned int dimSize) {
+  if (!pBuffer) {
+    return;
+  }
+
+  FILE *out = fopen(fileName, "w");
+  if (NULL == out) {
+    return;
+  }
+
+  for (unsigned int row = 0; row < dimSize; ++row) {
+    for (unsigned int col = 0; col < dimSize; ++col) {
+      fprintf(out, "%e,\t", pBuffer[row * (dimSize) + col]);
+    }
+    fprintf(out, "\n");
+  }
+  fclose(out);
+}
+
+// Compiles `source` as a fragment shader and links it into a new program.
+//
+// The created object names are returned through `shader` and `program`, and
+// the shader and program info logs are printed. The return value only tells
+// whether a program object was created; compile and link status are not
+// checked here.
+bool OCLGLCommon::createGLFragmentProgramFromSource(const char *source,
+                                                    GLuint &shader,
+                                                    GLuint &program) {
+  // Shader stage.
+  shader = glCreateShader(GL_FRAGMENT_SHADER);
+  glShaderSource(shader, 1, &source, NULL);
+  glCompileShader(shader);
+  printShaderInfoLog(shader);
+
+  // Program stage.
+  program = glCreateProgram();
+  glAttachShader(program, shader);
+  glLinkProgram(program);
+  printProgramInfoLog(program);
+
+  const bool programCreated = (program != 0);
+  return programCreated;
+}
+
+int OCLGLCommon::printOglError(char *file, int line) {
+  //
+  // Returns 1 if an OpenGL error occurred, 0 otherwise.
+  //
+  GLenum glErr;
+  int retCode = 0;
+
+  glErr = glGetError();
+  if (glErr != GL_NO_ERROR) {
+    printf("glError in file %s @ line %d: %d\n", file, line, glErr);
+    retCode = 1;
+  }
+  return retCode;
+}
+
+//
+// Prints the information log of a shader object to stdout. Nothing is printed
+// when the log is empty; an allocation failure is reported and the function
+// returns.
+//
+void OCLGLCommon::printShaderInfoLog(GLuint shader) {
+  int logLength = 0;
+  glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &logLength);
+  if (logLength <= 0) {
+    return;
+  }
+
+  GLchar *logText = (GLchar *)malloc(logLength);
+  if (logText == NULL) {
+    printf("ERROR: Could not allocate InfoLog buffer\n");
+    return;
+  }
+
+  int written = 0;
+  glGetShaderInfoLog(shader, logLength, &written, logText);
+  printf("Shader InfoLog:\n%s\n\n", logText);
+  free(logText);
+}
+
+// Prints the information log of a program object to stdout. Nothing is
+// printed when the log is empty.
+//
+// An allocation failure is reported and the function returns, the same way
+// printShaderInfoLog() handles it: failing to print a diagnostic log is not a
+// reason to terminate the whole process.
+void OCLGLCommon::printProgramInfoLog(GLuint program) {
+  int infologLength = 0;
+  int charsWritten = 0;
+  GLchar *infoLog;
+
+  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infologLength);
+
+  if (infologLength > 0) {
+    infoLog = (GLchar *)malloc(infologLength);
+    if (infoLog == NULL) {
+      printf("ERROR: Could not allocate InfoLog buffer\n");
+      return;
+    }
+    glGetProgramInfoLog(program, infologLength, &charsWritten, infoLog);
+    printf("Program InfoLog:\n%s\n\n", infoLog);
+    free(infoLog);
+  }
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h
new file mode 100644
index 0000000000..003267952d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_COMMON_H_
+#define _OCL_GL_COMMON_H_
+
+#include <GL/glew.h>
+#include <GL/gl.h>
+#include <GL/glx.h>
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+
+#include "OCLTestImp.h"
+
+typedef struct OCLGLHandle_* OCLGLHandle;
+
+#define printOpenGLError() OCLGLCommon::printOglError(__FILE__, __LINE__)
+
+// Common base for tests that share objects between OpenCL and OpenGL.
+// It extends OCLTestImp with an OpenGL context handle (hGL_) and static
+// OpenGL helper routines; the platform-specific parts are implemented per
+// windowing system in separate source files.
+class OCLGLCommon : public OCLTestImp {
+ public:
+  /////////////////////////////////
+  // initialization and clean-up //
+  /////////////////////////////////
+  OCLGLCommon();
+  virtual ~OCLGLCommon();
+  ///////////////////////
+  // virtual interface //
+  ///////////////////////
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual unsigned int close(void);
+  ////////////////////
+  // static helpers //
+  ////////////////////
+  // Sets a perspective projection; fovy is in degrees.
+  static void gluPerspective(double fovy, double aspect, double zNear,
+                             double zFar);
+  // Writes a dimSize x dimSize float matrix to a text file.
+  static void dumpBuffer(float* pBuffer, const char fileName[],
+                         unsigned int dimSize);
+  // Prints the pending OpenGL error, if any; returns 1 if there was one.
+  static int printOglError(char* file, int line);
+  // Compiles a fragment shader from source and links it into a new program.
+  static bool createGLFragmentProgramFromSource(const char* source,
+                                                GLuint& shader,
+                                                GLuint& program);
+  static void printShaderInfoLog(GLuint shader);
+  static void printProgramInfoLog(GLuint program);
+
+ protected:
+  // Returns the GL context handle owned by this object.
+  const OCLGLHandle getGLHandle() { return hGL_; }
+  void makeCurrent(const OCLGLHandle hGL);
+  // Fills `properties` with a zero-terminated CL context property list that
+  // refers to the GL context in `hGL`.
+  void getCLContextPropertiesFromGLContext(const OCLGLHandle hGL,
+                                           cl_context_properties properties[7]);
+  // Allocates a new handle into `hGL` and initializes a GL context for it.
+  bool createGLContext(OCLGLHandle& hGL);
+  // Deletes the GL context of `hGL`, frees the handle and sets it to NULL.
+  void destroyGLContext(OCLGLHandle& hGL);
+  bool IsGLEnabled(unsigned int test, char* units, double& conversion,
+                   unsigned int deviceId);
+
+ private:
+  bool initializeGLContext(OCLGLHandle& hGL);
+  void deleteGLContext(OCLGLHandle& hGL);
+  bool checkAssociationDeviceWithGLContext(OCLGLHandle& hGL);
+  void createCLContextFromGLContext(OCLGLHandle& hGL);
+
+  // GL context handle allocated in the constructor.
+  OCLGLHandle hGL_;
+};
+
+#endif  // _OCL_GL_COMMON_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp
new file mode 100644
index 0000000000..4d445d1442
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp
@@ -0,0 +1,239 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLCommon.h"
+
+// GLX implementation of the opaque GL handle.
+// The display connection, the visual and the reference count are static and
+// therefore shared by every handle; context, window and colormap belong to
+// the individual handle.
+struct OCLGLHandle_ {
+  // X display connection, opened on first use by initializeGLContext().
+  static Display* display;
+  // Visual chosen with glXChooseVisual() on first use.
+  static XVisualInfo* vInfo;
+  // Incremented by initializeGLContext(), decremented by deleteGLContext();
+  // the display is closed when it reaches zero.
+  static int referenceCount;
+  GLXContext context;
+  Window window;
+  Colormap cmap;
+};
+
+Display* OCLGLHandle_::display = NULL;
+XVisualInfo* OCLGLHandle_::vInfo = NULL;
+int OCLGLHandle_::referenceCount = 0;
+
+// Allocates the GL handle with no context, window or colormap attached yet.
+OCLGLCommon::OCLGLCommon() {
+  OCLGLHandle handle = new OCLGLHandle_;
+  handle->context = NULL;
+  handle->window = 0;
+  handle->cmap = 0;
+
+  hGL_ = handle;
+}
+
+// Releases the GL context and the handle owned by this object.
+OCLGLCommon::~OCLGLCommon() {
+  destroyGLContext(hGL_);
+}
+
+// Deletes the GL context held by `hGL`, frees the handle itself and resets
+// the caller's reference to NULL.
+void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) {
+  deleteGLContext(hGL);
+
+  delete hGL;
+  hGL = NULL;
+}
+
+// Releases the GLX resources of `hGL` (colormap, window, context) and drops
+// this handle's reference on the shared display connection. The display and
+// the shared visual are released when the last reference goes away.
+//
+// The function may be called more than once for the same handle (close()
+// followed by the destructor). The shared reference count is therefore only
+// decremented when the handle actually held resources; otherwise a repeated
+// call would drop a reference that belongs to another live handle and close
+// the display underneath it.
+//
+// NOTE(review): a reference taken by an initialization that failed before the
+// GLX context was created cannot be recognized here and is not released.
+void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) {
+  if (hGL == NULL || hGL->display == NULL) {
+    return;
+  }
+
+  const bool ownsResources =
+      (hGL->context != NULL) || (hGL->window != 0) || (hGL->cmap != 0);
+
+  glXMakeCurrent(hGL->display, None, NULL);
+  if (hGL->cmap) {
+    XFreeColormap(hGL->display, hGL->cmap);
+    hGL->cmap = 0;
+  }
+  if (hGL->window) {
+    XDestroyWindow(hGL->display, hGL->window);
+    hGL->window = 0;
+  }
+  if (hGL->context) {
+    glXDestroyContext(hGL->display, hGL->context);
+    hGL->context = NULL;
+  }
+
+  if (!ownsResources) {
+    return;
+  }
+
+  // Never let the shared count go negative.
+  if (hGL->referenceCount > 0) {
+    hGL->referenceCount--;
+  }
+  if (hGL->referenceCount == 0) {
+    XCloseDisplay(hGL->display);
+    hGL->display = NULL;
+
+    if (hGL->vInfo != NULL) {
+      XFree(hGL->vInfo);
+      hGL->vInfo = NULL;
+    }
+  }
+}
+
+// Allocates a new GL handle into `hGL` and initializes a GL context for it.
+// Returns the result of initializeGLContext().
+//
+// The per-handle members are cleared first, as in the constructor:
+// deleteGLContext() inspects them and must not see indeterminate values when
+// initialization fails before they are assigned.
+bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) {
+  hGL = new OCLGLHandle_;
+  hGL->context = NULL;
+  hGL->window = 0;
+  hGL->cmap = 0;
+  return initializeGLContext(hGL);
+}
+
+// Creates the GLX context, colormap and window for `hGL` and makes the
+// context current on the calling thread.
+//
+// The X display connection and the visual are static members shared by all
+// handles and are created on first use. Returns false if a step fails or if
+// the selected CL device cannot be associated with the new GL context.
+bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) {
+  if (hGL->display == NULL) {
+    hGL->display = XOpenDisplay(NULL);
+    if (hGL->display == NULL) {
+      printf("XOpenDisplay() failed\n");
+      return false;
+    }
+  }
+  if (hGL->vInfo == NULL) {
+    int dblBuf[] = {GLX_RGBA, GLX_RED_SIZE,     1,   GLX_GREEN_SIZE,
+                    1,        GLX_BLUE_SIZE,    1,   GLX_DEPTH_SIZE,
+                    12,       GLX_DOUBLEBUFFER, None};
+
+    hGL->vInfo =
+        glXChooseVisual(hGL->display, DefaultScreen(hGL->display), dblBuf);
+    if (hGL->vInfo == NULL) {
+      printf("glXChooseVisual() failed\n");
+      return false;
+    }
+  }
+  hGL->referenceCount++;
+
+  hGL->context = glXCreateContext(hGL->display, hGL->vInfo, None, True);
+  if (hGL->context == NULL) {
+    printf("glXCreateContext() failed\n");
+    return false;
+  }
+
+  XSetWindowAttributes swa = {0};
+  hGL->cmap = XCreateColormap(hGL->display,
+                              RootWindow(hGL->display, hGL->vInfo->screen),
+                              hGL->vInfo->visual, AllocNone);
+  swa.colormap = hGL->cmap;
+  hGL->window = XCreateWindow(
+      hGL->display, RootWindow(hGL->display, hGL->vInfo->screen), 0, 0, 640,
+      480, 0, hGL->vInfo->depth, InputOutput, hGL->vInfo->visual,
+      CWBorderPixel | CWColormap | CWEventMask, &swa);
+
+  Bool glErr = glXMakeCurrent(hGL->display, hGL->window, hGL->context);
+  if (False == glErr) {
+    // Release the context, window and colormap created above, exactly as the
+    // failure path below does; otherwise a later initialization of the same
+    // handle would overwrite and leak them.
+    printf("glXMakeCurrent() failed\n");
+    deleteGLContext(hGL);
+    return false;
+  }
+
+  if (!checkAssociationDeviceWithGLContext(hGL)) {
+    deleteGLContext(hGL);
+    return false;
+  }
+  return true;
+}
+
+// Tells whether the selected CL device (devices_[_deviceId]) is among the
+// devices that can be associated with the GL context held by `hGL`.
+//
+// The device list is obtained with clGetGLContextInfoKHR
+// (CL_DEVICES_FOR_GL_CONTEXT_KHR). Returns false when the query fails, when
+// the list is empty, when memory for it cannot be allocated, or when the
+// device is not listed.
+bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) {
+  bool ret = false;
+  size_t devicesSize = 0;
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->context,
+                                        CL_GLX_DISPLAY_KHR,
+                                        (cl_context_properties)hGL->display,
+                                        0};
+
+  error_ = _wrapper->clGetGLContextInfoKHR(
+      properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &devicesSize);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    return false;
+  }
+
+  cl_uint numDevices = (cl_uint)(devicesSize / sizeof(cl_device_id));
+  if (numDevices == 0) {
+    // No device can share with this GL context; nothing to compare against.
+    return false;
+  }
+
+  cl_device_id* interopDevices = (cl_device_id*)malloc(devicesSize);
+  if (interopDevices == NULL) {
+    printf("ERROR: Could not allocate interop device list\n");
+    return false;
+  }
+
+  error_ =
+      _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR,
+                                      devicesSize, interopDevices, NULL);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    free(interopDevices);
+    return false;
+  }
+
+  // Check that current device can be associated with OpenGL context
+  for (unsigned int i = 0; i < numDevices; i++) {
+    if (interopDevices[i] == devices_[_deviceId]) {
+      ret = true;
+      break;
+    }
+  }
+
+  free(interopDevices);
+  return ret;
+}
+
+// Replaces the current CL context and the command queue of the selected
+// device with ones created from the GL context in `hGL`, then initializes
+// GLEW.
+//
+// The released handles are cleared right after their release so the members
+// never hold a released queue or context if one of the re-creation steps
+// below fails and the function exits early.
+void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) {
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->context,
+                                        CL_GLX_DISPLAY_KHR,
+                                        (cl_context_properties)hGL->display,
+                                        0};
+
+  // Release current command queue
+  if (cmdQueues_[_deviceId]) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]);
+    cmdQueues_[_deviceId] = 0;
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  // Release current context
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    context_ = 0;
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
+  }
+
+  // Create new CL context from GL context
+  context_ =
+      clCreateContext(properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_);
+
+  // Create command queue for new context
+  cmdQueues_[_deviceId] =
+      _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+               error_);
+
+  // GLEW versions 1.13.0 and earlier do not fetch all GL function pointers
+  // without glewExperimental set.
+  glewExperimental = GL_TRUE;
+  GLenum glErr = glewInit();
+  CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed: %s",
+               glewGetErrorString(glErr));
+}
+
+// Makes the GL context of `hGL` current on the calling thread. When `hGL` is
+// NULL the current context is released instead, using the display of this
+// object's own handle if there is one.
+void OCLGLCommon::makeCurrent(OCLGLHandle hGL) {
+  if (hGL != NULL) {
+    bool ok = glXMakeCurrent(hGL->display, hGL->window, hGL->context);
+    assert(ok && "glXMakeCurrent failed!");
+    return;
+  }
+
+  if (hGL_ != NULL) {
+    glXMakeCurrent(hGL_->display, None, NULL);
+  }
+}
+
+// Fills `properties` (7 entries) with a zero-terminated CL context property
+// list naming the platform, the GLX context and the X display of `hGL`.
+// Does nothing when `properties` is NULL.
+void OCLGLCommon::getCLContextPropertiesFromGLContext(
+    const OCLGLHandle hGL, cl_context_properties properties[7]) {
+  if (!properties) return;
+
+  const cl_context_properties values[7] = {
+      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_,
+      CL_GL_CONTEXT_KHR,   (cl_context_properties)hGL->context,
+      CL_GLX_DISPLAY_KHR,  (cl_context_properties)hGL->display,
+      0};
+
+  for (int i = 0; i < 7; ++i) {
+    properties[i] = values[i];
+  }
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp
new file mode 100644
index 0000000000..4a08bd6268
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp
@@ -0,0 +1,239 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLCommon.h"
+
+// Windows (WGL) GL handle: the device context and the OpenGL rendering
+// context that the functions in this file create, bind and delete.
+struct OCLGLHandle_ {
+  HDC hdc;      // device context; assigned from CreateDC()
+  HGLRC hglrc;  // rendering context; assigned from wglCreateContext()
+};
+
+// Allocates this object's own GL handle with neither a device context nor a
+// rendering context attached yet.
+OCLGLCommon::OCLGLCommon() {
+  OCLGLHandle_* handle = new OCLGLHandle_;
+  handle->hglrc = NULL;
+  handle->hdc = NULL;
+
+  hGL_ = handle;
+}
+
+// Tears down the GL context owned by this object and frees its handle.
+OCLGLCommon::~OCLGLCommon() {
+  destroyGLContext(hGL_);
+}
+
+// Releases the GL resources referenced by hGL, frees the handle object and
+// resets the caller's reference to NULL.
+void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) {
+  deleteGLContext(hGL);
+
+  OCLGLHandle released = hGL;
+  hGL = NULL;
+  delete released;
+}
+
+// Unbinds the current GL context via wglMakeCurrent(NULL, NULL), then deletes
+// the rendering context and the device context stored in hGL (whichever are
+// set) and clears both fields. The handle object itself is not freed here.
+void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) {
+  wglMakeCurrent(NULL, NULL);
+
+  // destroyGLContext() resets the caller's handle to NULL, so a later
+  // teardown of the same handle (e.g. from the destructor) can arrive here
+  // with nothing left to release.
+  if (hGL == NULL) {
+    return;
+  }
+
+  if (hGL->hglrc) {
+    wglDeleteContext(hGL->hglrc);
+    hGL->hglrc = NULL;
+  }
+  if (hGL->hdc) {
+    DeleteDC(hGL->hdc);
+    hGL->hdc = NULL;
+  }
+}
+
+// Allocates a fresh GL handle into hGL and tries to build a GL context on it.
+// Returns the result of initializeGLContext(). Whatever hGL referred to
+// before the call is overwritten, not freed.
+bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) {
+  hGL = new OCLGLHandle_;
+
+  // initializeGLContext() can return false without ever assigning these (for
+  // example when no display device is enumerated); start them at NULL so a
+  // later deleteGLContext() never tests indeterminate values.
+  hGL->hdc = NULL;
+  hGL->hglrc = NULL;
+
+  return initializeGLContext(hGL);
+}
+
+// Builds a WGL rendering context on hGL and makes it current.
+//
+// Display devices are enumerated in order; mirroring drivers and devices for
+// which CreateDC() fails are skipped. On the first device that yields a DC the
+// function sets the pixel format, creates the rendering context, makes it
+// current and checks that the selected CL device can share with it.
+//
+// Returns true when all of those steps succeed. Any failure after a DC has
+// been obtained returns false immediately -- later display devices are not
+// tried. Only the association-check failure path calls deleteGLContext(); on
+// the other failure paths whatever was created so far stays stored in hGL.
+// Returns false as well when no display device produced a DC.
+bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) {
+  BOOL glErr = FALSE;
+  DISPLAY_DEVICE dispDevice;
+  DWORD deviceNum;
+  int pfmt;
+  // Requested format: double-buffered RGBA window surface, 24 colour bits
+  // (8 per channel) with 8 alpha bits, 24-bit depth, 8-bit stencil, no
+  // accumulation or auxiliary buffers.
+  PIXELFORMATDESCRIPTOR pfd;
+  pfd.nSize = sizeof(PIXELFORMATDESCRIPTOR);
+  pfd.nVersion = 1;
+  pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
+  pfd.iPixelType = PFD_TYPE_RGBA;
+  pfd.cColorBits = 24;
+  pfd.cRedBits = 8;
+  pfd.cRedShift = 0;
+  pfd.cGreenBits = 8;
+  pfd.cGreenShift = 0;
+  pfd.cBlueBits = 8;
+  pfd.cBlueShift = 0;
+  pfd.cAlphaBits = 8;
+  pfd.cAlphaShift = 0;
+  pfd.cAccumBits = 0;
+  pfd.cAccumRedBits = 0;
+  pfd.cAccumGreenBits = 0;
+  pfd.cAccumBlueBits = 0;
+  pfd.cAccumAlphaBits = 0;
+  pfd.cDepthBits = 24;
+  pfd.cStencilBits = 8;
+  pfd.cAuxBuffers = 0;
+  pfd.iLayerType = PFD_MAIN_PLANE;
+  pfd.bReserved = 0;
+  pfd.dwLayerMask = 0;
+  pfd.dwVisibleMask = 0;
+  pfd.dwDamageMask = 0;
+
+  // cb must hold the structure size before EnumDisplayDevices() is called.
+  dispDevice.cb = sizeof(DISPLAY_DEVICE);
+  for (deviceNum = 0; EnumDisplayDevices(NULL, deviceNum, &dispDevice, 0);
+       deviceNum++) {
+    if (dispDevice.StateFlags & DISPLAY_DEVICE_MIRRORING_DRIVER) {
+      continue;
+    }
+
+    // Try the next display device if no DC can be created on this one.
+    hGL->hdc = CreateDC(NULL, dispDevice.DeviceName, NULL, NULL);
+    if (!hGL->hdc) {
+      continue;
+    }
+
+    pfmt = ChoosePixelFormat(hGL->hdc, &pfd);
+    if (pfmt == 0) {
+      printf("Failed choosing the requested PixelFormat.\n");
+      return false;
+    }
+
+    glErr = SetPixelFormat(hGL->hdc, pfmt, &pfd);
+    if (glErr == FALSE) {
+      printf("Failed to set the requested PixelFormat.\n");
+      return false;
+    }
+
+    hGL->hglrc = wglCreateContext(hGL->hdc);
+    if (NULL == hGL->hglrc) {
+      printf("wglCreateContext() failed\n");
+      return false;
+    }
+
+    glErr = wglMakeCurrent(hGL->hdc, hGL->hglrc);
+    if (FALSE == glErr) {
+      printf("wglMakeCurrent() failed\n");
+      return false;
+    }
+
+    // The GL context is only useful if the CL device under test is among the
+    // devices that can share with it; otherwise release it again.
+    if (!checkAssociationDeviceWithGLContext(hGL)) {
+      deleteGLContext(hGL);
+      return false;
+    }
+
+    return true;
+  }  // for each display device reported by EnumDisplayDevices()
+
+  // No display device produced a device context.
+  return false;
+}
+
+// Reports whether the CL device selected for this test (devices_[_deviceId])
+// is among the devices that clGetGLContextInfoKHR() lists for the GL context
+// in hGL. Returns false when the query fails, when the list is empty or
+// cannot be allocated, or when the device is not in the list.
+bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) {
+  bool ret = false;
+  size_t devicesSize = 0;
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->hglrc,
+                                        CL_WGL_HDC_KHR,
+                                        (cl_context_properties)hGL->hdc,
+                                        0};
+
+  // First call: ask only for the size of the device list.
+  error_ = _wrapper->clGetGLContextInfoKHR(
+      properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &devicesSize);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    return false;
+  }
+
+  cl_uint numDevices = (cl_uint)(devicesSize / sizeof(cl_device_id));
+  if (numDevices == 0) {
+    // Nothing can share with this GL context; avoid a zero-sized allocation.
+    return false;
+  }
+
+  cl_device_id* interopDevices = (cl_device_id*)malloc(devicesSize);
+  if (interopDevices == NULL) {
+    printf("Failed to allocate %u bytes for the interop device list\n",
+           (unsigned int)devicesSize);
+    return false;
+  }
+
+  // Second call: fetch the device list itself.
+  error_ =
+      _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR,
+                                      devicesSize, interopDevices, NULL);
+  if (error_ != CL_SUCCESS) {
+    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
+    free(interopDevices);
+    return false;
+  }
+
+  // Check that current device can be associated with OpenGL context
+  for (unsigned int i = 0; i < numDevices; i++) {
+    if (interopDevices[i] == devices_[_deviceId]) {
+      ret = true;
+      break;
+    }
+  }
+
+  free(interopDevices);
+  return ret;
+}
+
+// Replaces the test's CL context and the command queue of the selected device
+// with ones created against the GL context in hGL (WGL sharing properties),
+// then initialises GLEW. Failures are reported through CHECK_RESULT.
+void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) {
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)platform_,
+                                        CL_GL_CONTEXT_KHR,
+                                        (cl_context_properties)hGL->hglrc,
+                                        CL_WGL_HDC_KHR,
+                                        (cl_context_properties)hGL->hdc,
+                                        0};
+
+  // Release current command queue
+  if (cmdQueues_[_deviceId]) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+    // Forget the released handle so that a failure further down does not
+    // leave a stale queue behind for a later release.
+    cmdQueues_[_deviceId] = 0;
+  }
+
+  // Release current context
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
+    // Same reasoning as for the queue: never keep a released context around.
+    context_ = 0;
+  }
+
+  // Create new CL context from GL context
+  context_ =
+      clCreateContext(properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_);
+
+  // Create command queue for new context
+  cmdQueues_[_deviceId] =
+      _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+               error_);
+
+  GLenum glErr = glewInit();
+  CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed");
+}
+
+// Makes the WGL context held by hGL current through wglMakeCurrent(); a NULL
+// hGL releases the current context instead.
+void OCLGLCommon::makeCurrent(OCLGLHandle hGL) {
+  if (hGL != NULL) {
+    wglMakeCurrent(hGL->hdc, hGL->hglrc);
+    return;
+  }
+
+  wglMakeCurrent(NULL, NULL);
+}
+
+// Writes the zero-terminated CL context property list that describes the WGL
+// context in hGL (platform, GL rendering context, device context) into the
+// caller's 7-entry array. Does nothing when properties is NULL; hGL itself is
+// not checked.
+void OCLGLCommon::getCLContextPropertiesFromGLContext(
+    const OCLGLHandle hGL, cl_context_properties properties[7]) {
+  if (properties == NULL) {
+    return;
+  }
+
+  const cl_context_properties wglProps[7] = {
+      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_,
+      CL_GL_CONTEXT_KHR,   (cl_context_properties)hGL->hglrc,
+      CL_WGL_HDC_KHR,      (cl_context_properties)hGL->hdc,
+      0};
+
+  for (int slot = 0; slot < 7; ++slot) {
+    properties[slot] = wglProps[slot];
+  }
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp
new file mode 100644
index 0000000000..70d33ecb10
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp
@@ -0,0 +1,288 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestImp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstdio>
+#include <cstring>
+
+/////////////////////////////////////////////////////////////////////////////
+
+static unsigned int crcinit(unsigned int crc);
+static int initializeSeed(void);
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Storage for the two class-wide (static) locks of OCLTestImp.
+// NOTE(review): judging by the names they serialise device opening and
+// program compilation across tests -- confirm at the use sites.
+OCLutil::Lock OCLTestImp::openDeviceLock;
+OCLutil::Lock OCLTestImp::compileLock;
+
+// Puts a new test object into its idle state: no wrapper, no OpenCL objects,
+// GPU as the default device type, a clock-derived random seed, a cleared
+// error state and a freshly computed 256-entry CRC table.
+OCLTestImp::OCLTestImp()
+    : _wrapper(0),
+      _seed(initializeSeed()),
+      error_(0),
+      type_(CL_DEVICE_TYPE_GPU),
+      deviceCount_(0),
+      devices_(0),
+      platform_(0),
+      context_(0),
+      program_(0),
+      kernel_(0) {
+  _perfInfo = 0;
+  _iterationCnt = 0;
+
+  _errorFlag = false;
+  _errorMsg = "";
+
+  // Entry n of the table is the CRC of the single byte n placed in the top
+  // eight bits of the register.
+  for (unsigned int byteValue = 0; byteValue < 256; ++byteValue) {
+    _crctab[byteValue] = crcinit(byteValue << 24);
+  }
+}
+
+// Nothing to release here; OpenCL objects are released in close().
+OCLTestImp::~OCLTestImp() {
+}
+// Switches the device type used by the next open() from the GPU default to
+// CPU devices.
+void OCLTestImp::useCPU() {
+  type_ = CL_DEVICE_TYPE_CPU;
+}
+// Convenience overload: clears the per-test OpenCL state and opens the test
+// on the platform reported by getPlatformIndex().
+void OCLTestImp::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId) {
+  deviceCount_ = 0;
+  devices_ = 0;
+
+  kernel_ = 0;
+  program_ = 0;
+  context_ = 0;
+
+  const unsigned int platformIndex = getPlatformIndex();
+  open(test, units, conversion, deviceId, platformIndex);
+}
+// Opens the test on the platform selected by platformIndex.
+//
+// Resets the per-test OpenCL state, looks up the platform, enumerates its
+// devices of type type_, creates one context spanning all of them and one
+// profiling-enabled command queue per device (appended to cmdQueues_), and
+// finally records the platform in platform_. Failures are reported through
+// CHECK_RESULT. The test, units and conversion arguments are not used here.
+void OCLTestImp::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId, unsigned int platformIndex) {
+  BaseTestImp::open();
+  devices_ = 0;
+  deviceCount_ = 0;
+  context_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  _deviceId = deviceId;
+  _platformIndex = platformIndex;
+
+  cl_uint numPlatforms = 0;
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
+  CHECK_RESULT((numPlatforms == 0), "No platform found");
+  // Reject an out-of-range index before it is used to subscript the array.
+  CHECK_RESULT((_platformIndex >= numPlatforms),
+               "Platform index %u out of range (%u platforms found)",
+               _platformIndex, numPlatforms);
+
+  cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+  error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+  // Pick the entry and free the array before the result is checked, so the
+  // array is not leaked if the check below leaves the function.
+  cl_platform_id platform =
+      (error_ == CL_SUCCESS) ? platforms[_platformIndex] : 0;
+  delete[] platforms;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+  CHECK_RESULT((platform == 0), "AMD Platform not found");
+
+  // Two-step device query: count first, then fetch the ids.
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  devices_ = new cl_device_id[deviceCount_];
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)platform, 0};
+  context_ = _wrapper->clCreateContext(props, deviceCount_, devices_, NULL, 0,
+                                       &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext failed");
+
+  cl_command_queue cmdQueue;
+  for (unsigned int i = 0; i < deviceCount_; ++i) {
+#ifndef CL_VERSION_2_0
+    cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[i], CL_QUEUE_PROFILING_ENABLE, &error_);
+#else
+    cl_queue_properties prop[] = {CL_QUEUE_PROPERTIES,
+                                  CL_QUEUE_PROFILING_ENABLE, 0};
+    cmdQueue = _wrapper->clCreateCommandQueueWithProperties(
+        context_, devices_[i], prop, &error_);
+#endif
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues_.push_back(cmdQueue);
+  }
+  platform_ = platform;
+}
+
+// Releases everything open() and the test created: buffers, kernel, program,
+// command queues, context and the device id array, then returns the result
+// of BaseTestImp::close(). Release failures are recorded with
+// CHECK_RESULT_NO_RETURN so that the remaining objects are still released.
+// Every handle is cleared once released, so calling close() again does not
+// release or delete anything a second time.
+unsigned int OCLTestImp::close() {
+  for (unsigned int i = 0; i < buffers().size(); ++i) {
+    error_ = _wrapper->clReleaseMemObject(buffers()[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseMemObject() failed");
+  }
+  buffers_.clear();
+
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed");
+    kernel_ = 0;
+  }
+
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseProgram() failed");
+    program_ = 0;
+  }
+
+  for (unsigned int i = 0; i < cmdQueues_.size(); ++i) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[i]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+  cmdQueues_.clear();
+
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
+    context_ = 0;
+  }
+
+  // delete[] on a null pointer is a no-op, so no guard is needed.
+  delete[] devices_;
+  devices_ = 0;
+
+  return BaseTestImp::close();
+}
+
+// Returns n pseudo-random bits (1 <= n <= 32) from the linear congruential
+// generator kept in _seed. For n < 32 one generator step is taken and the top
+// n of its low 31 bits are returned. For n == 32 two 16-bit draws are
+// combined, the first draw forming the upper half. An out-of-range n asserts
+// and yields 0.
+int OCLTestImp::genBitRand(int n) {
+  int rslt;
+  if (n <= 0 || n > 32) {
+    assert(0);
+    rslt = 0;
+  } else if (n < 32) {
+    _seed = _seed * 1103515245 + 12345;
+    /*
+     * return the most-significant n bits; they are the random ones (see
+     * Knuth, Vol 2)
+     */
+    rslt = (_seed & 0x7fffffff) >> (31 - n);
+  } else {
+    // Draw the halves in separate statements: inside a single expression the
+    // order of the two calls is unspecified, which made the sequence depend
+    // on the compiler. Combine them as unsigned so that no bit is shifted
+    // into the sign position of an int.
+    const unsigned int high = (unsigned int)genBitRand(16);
+    const unsigned int low = (unsigned int)genBitRand(16);
+    rslt = (int)((high << 16) | low);
+  }
+
+  return rslt;
+}
+
+// Returns a pseudo-random integer between a and b inclusive (the bounds may
+// be given in either order). Candidates of just enough bits to cover the
+// distance between the bounds are drawn with genBitRand() and rejected until
+// one does not exceed that distance.
+int OCLTestImp::genIntRand(int a, int b) {
+  const int lower = (a > b) ? b : a;
+  int span = (a > b) ? (a - b) : (b - a);
+
+  if (span == 0) {
+    return a;
+  }
+
+  // A negative span can only come from the subtraction wrapping around.
+  int direction = 1;
+  if (span < 0) {
+    direction = -1;
+    span = -span;
+  }
+  span &= 0x7fffffff;
+
+  // Number of bits needed to represent span.
+  int width = 0;
+  for (int rest = span; rest > 0; rest >>= 1) {
+    ++width;
+  }
+
+  int draw = genBitRand(width);
+  while (draw > span) {
+    draw = genBitRand(width);
+  }
+
+  return lower + draw * direction;
+}
+
+// Stores the OpenCL wrapper through which this test issues its CL calls. The
+// pointer is kept as is; no copy is made.
+void OCLTestImp::setOCLWrapper(OCLWrapper* wrapper) {
+  _wrapper = wrapper;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef ATI_OS_WIN
+
+#include <windows.h>
+
+// Windows seed source: the current performance-counter value, truncated to
+// int.
+static int initializeSeed(void) {
+  LARGE_INTEGER ticks;
+  QueryPerformanceCounter(&ticks);
+  return (int)ticks.QuadPart;
+}
+
+#endif  // ATI_OS_WIN
+
+/////////////////////////////////////////////////////////////////////////////
+
+#ifdef ATI_OS_LINUX
+
+#include <sys/time.h>
+
+// Linux seed source: the microsecond field of the current time of day.
+static int initializeSeed(void) {
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return (int)now.tv_usec;
+}
+
+#endif  // ATI_OS_LINUX
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Same CRC32 as used by ogtst.
+//
+// crcinit() advances the register through eight shift steps; each time a set
+// top bit is shifted out, the polynomial CRCMASK is XOR-ed in. Feeding it
+// (n << 24) gives table entry n.
+//
+static const unsigned int CRCMASK = 0x04c11db7;
+
+static unsigned int crcinit(unsigned int crc) {
+  unsigned int reg = crc;
+  int remaining = 8;
+
+  while (remaining-- > 0) {
+    const unsigned int carry = reg & 0x80000000;
+    reg <<= 1;
+    if (carry) {
+      reg ^= CRCMASK;
+    }
+  }
+  return reg;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp
new file mode 100644
index 0000000000..4398652904
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp
@@ -0,0 +1,70 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+#include <stdlib.h>
+
+#include "OCLTest.h"
+
+//
+//  OCLTestList_TestCount - number of tests exported by this testing module
+//
+unsigned int OCL_CALLCONV OCLTestList_TestCount(void) {
+  const unsigned int count = TestListCount;
+  return count;
+}
+
+//
+//  OCLTestList_TestLibVersion - version number of the test library built
+//  into this testing module
+//
+unsigned int OCL_CALLCONV OCLTestList_TestLibVersion(void) {
+  const unsigned int version = TestLibVersion;
+  return version;
+}
+
+//
+//  OCLTestList_TestLibName - name of the test library
+//
+const char* OCL_CALLCONV OCLTestList_TestLibName(void) {
+  const char* libName = TestLibName;
+  return libName;
+}
+
+//
+//  OCLTestList_TestName - name of the test at index testNum, or NULL when
+//  the index is not below OCLTestList_TestCount()
+//
+const char* OCL_CALLCONV OCLTestList_TestName(unsigned int testNum) {
+  if (testNum < OCLTestList_TestCount()) {
+    return TestList[testNum].name;
+  }
+
+  return NULL;
+}
+
+//
+//  OCLTestList_CreateTest - run the factory of the test at index testNum and
+//  return the new object, or NULL when the index is not below
+//  OCLTestList_TestCount()
+//
+OCLTest* OCL_CALLCONV OCLTestList_CreateTest(unsigned int testNum) {
+  if (testNum < OCLTestList_TestCount()) {
+    return reinterpret_cast<OCLTest*>((*TestList[testNum].create)());
+  }
+
+  return NULL;
+}
+
+//
+//  OCLTestList_DestroyTest - delete a test object
+//
+void OCL_CALLCONV OCLTestList_DestroyTest(OCLTest* test) {
+  delete test;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp
new file mode 100644
index 0000000000..e5b341956c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestUtils.h"
+
+#include <fstream>
+#include <iostream>
+
+// Reads the whole file `filename` (opened in binary mode) into `s`.
+//
+// Returns true on success; `s` then holds exactly the bytes of the file,
+// including any embedded NUL bytes. Returns false and prints a message to
+// std::cerr when the file cannot be opened, its size cannot be determined, or
+// the read comes up short; `s` is left untouched in those cases.
+bool loadFile(const char* filename, std::string& s) {
+  std::fstream f(filename, std::fstream::in | std::fstream::binary);
+  if (!f.is_open()) {
+    std::cerr << "Error: failed to open file: " << filename << '\n';
+    return false;
+  }
+
+  // Measure the file by seeking to its end. tellg() reports -1 on failure,
+  // which must not be turned into a buffer size.
+  f.seekg(0, std::fstream::end);
+  const std::streamoff fileEnd = f.tellg();
+  if (fileEnd < 0) {
+    std::cerr << "Error: failed to read file: " << filename << '\n';
+    return false;
+  }
+  f.seekg(0, std::fstream::beg);
+
+  const size_t size = (size_t)fileEnd;
+  std::string contents(size, '\0');
+  if (size > 0) {
+    f.read(&contents[0], (std::streamsize)size);
+  }
+  // A failed seek or a short read leaves the stream in a failed state.
+  if (!f) {
+    std::cerr << "Error: failed to read file: " << filename << '\n';
+    return false;
+  }
+
+  s.swap(contents);
+  return true;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp
new file mode 100644
index 0000000000..051f565b9d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp
@@ -0,0 +1,209 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+//!
+//! \file OCLThread.cpp
+//!
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "OCL/Thread.h"
+#ifdef ATI_OS_WIN
+#include <process.h>
+#endif
+
+//! Pack the thread entry point and its data pointer into one object, so that
+//! both can travel through the single void* argument a thread start routine
+//! receives.
+typedef struct __argsToThreadFunc {
+  oclThreadFunc func;  //!< entry point to invoke on the new thread
+  void *data;          //!< Thread::create() stores the Thread object here
+
+} argsToThreadFunc;
+
+#ifdef ATI_OS_WIN
+//! Windows thread start routine - unpacks the argsToThreadFunc allocated by
+//! Thread::create(), invokes the application's callback with the data held
+//! by the Thread object, then frees the argument block. Always returns 0.
+unsigned _stdcall win32ThreadFunc(void *args) {
+  argsToThreadFunc *ptr = (argsToThreadFunc *)args;
+  OCLutil::Thread *obj = (OCLutil::Thread *)ptr->data;
+  ptr->func(obj->getData());
+  // Delete through the typed pointer: deleting a void* is undefined.
+  delete ptr;
+  return 0;
+}
+#endif
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Lock constructor: initialises the underlying OS primitive (a critical
+//! section on Windows, a default-attribute pthread mutex elsewhere)
+//!
+OCLutil::Lock::Lock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_init(&_lock, NULL);
+#else
+  InitializeCriticalSection(&_cs);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Lock destructor: destroys the underlying OS primitive
+//!
+OCLutil::Lock::~Lock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_destroy(&_lock);
+#else
+  DeleteCriticalSection(&_cs);
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Acquire the lock, waiting until it becomes available, and
+//! enter the protected area
+//!
+void OCLutil::Lock::lock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_lock(&_lock);
+#else
+  EnterCriticalSection(&_cs);
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Attempt to acquire the lock without waiting. Returns true when
+//! the lock was taken (the caller is then inside the critical
+//! section) and false when it was not.
+//!
+bool OCLutil::Lock::tryLock() {
+#ifndef ATI_OS_WIN
+  return pthread_mutex_trylock(&_lock) == 0;
+#else
+  return TryEnterCriticalSection(&_cs) ? true : false;
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Release the lock
+//!
+void OCLutil::Lock::unlock() {
+#ifndef ATI_OS_WIN
+  pthread_mutex_unlock(&_lock);
+#else
+  LeaveCriticalSection(&_cs);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Thread constructor: no OS thread is started yet; the thread
+//! handle and the user data start out as 0 (as does _ID on Windows)
+//!
+OCLutil::Thread::Thread() : _tid(0), _data(0) {
+#if defined(ATI_OS_WIN)
+  _ID = 0;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//!
+//! Destructor for OCLThread: on Windows closes the thread handle, if
+//! one was ever created. Nothing to do on other platforms.
+//!
+OCLutil::Thread::~Thread() {
+#ifdef ATI_OS_WIN
+  // _tid is 0 until create() has produced a handle; CloseHandle() must only
+  // be given a real handle.
+  if (_tid != NULL) {
+    CloseHandle(_tid);
+    _tid = NULL;
+  }
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Create a new thread running func and return whether the thread
+//! was started. arg is stored in the Thread object; the callback
+//! receives it as its argument. When the VERBOSE environment
+//! variable is set, the outcome is printed to stdout.
+//!
+bool OCLutil::Thread::create(oclThreadFunc func, void *arg) {
+  // Save the data internally
+  _data = arg;
+
+  bool verbose = getenv("VERBOSE") != NULL;
+
+#ifdef ATI_OS_WIN
+  // Setup the callback struct for thread function and pass to the
+  // begin thread routine. win32ThreadFunc frees it once the callback
+  // has returned.
+  argsToThreadFunc *args = new argsToThreadFunc;
+  args->func = func;
+  args->data = this;
+
+  unsigned int threadId = 0;
+  _tid = (HANDLE)_beginthreadex(NULL, 0, win32ThreadFunc, args, 0, &threadId);
+
+  if (verbose) {
+    printf("Thread handle value = %p\n", _tid);
+
+    printf("Done creating thread. Thread id value = %u\n", threadId);
+  }
+
+  // Success is signalled by a non-zero handle. The thread id written through
+  // the last argument is non-zero for a running thread and must not be used
+  // as an error code.
+  if (_tid == NULL) {
+    // The thread never started, so nobody else will free the argument block.
+    delete args;
+    return false;
+  }
+
+  return true;
+#else
+  //! Now create the thread; the callback receives arg directly
+  int rc = pthread_create(&_tid, NULL, func, arg);
+
+  if (verbose)
+    printf("Done creating thread. Ret value %d, Self = %u\n", rc,
+           (unsigned int)pthread_self());
+
+  return rc == 0;
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Return the identifier of the calling thread, as reported by
+//! GetCurrentThreadId() on Windows or by pthread_self() (cast to
+//! unsigned int) elsewhere
+//!
+unsigned int OCLutil::Thread::getID() {
+#ifndef ATI_OS_WIN
+  return (unsigned int)pthread_self();
+#else
+  return GetCurrentThreadId();
+#endif
+}
+
+//////////////////////////////////////////////////////////////
+//!
+//! Block until this thread has finished. Returns true when the
+//! underlying wait/join call reports 0, false otherwise.
+//!
+bool OCLutil::Thread::join() {
+#ifdef ATI_OS_WIN
+  const DWORD waitResult = WaitForSingleObject(_tid, INFINITE);
+
+  if (waitResult == WAIT_FAILED) {
+    printf("Bad call to function(invalid handle?)\n");
+  }
+
+  return waitResult == 0;
+#else
+  return pthread_join(_tid, NULL) == 0;
+#endif
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp
new file mode 100644
index 0000000000..f78fd73287
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp
@@ -0,0 +1,944 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLWrapper.h"
+
+OCLWrapper::OCLWrapper() {
+  clEnqueueWaitSignalAMD_ptr =
+      (clEnqueueWaitSignalAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueWaitSignalAMD");
+  clEnqueueWriteSignalAMD_ptr =
+      (clEnqueueWriteSignalAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueWriteSignalAMD");
+  clEnqueueMakeBuffersResidentAMD_ptr =
+      (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueMakeBuffersResidentAMD");
+
+  clUnloadPlatformAMD_ptr =
+      (clUnloadPlatformAMD_fn)clGetExtensionFunctionAddress(
+          "clUnloadPlatformAMD");
+
+  // CL-GL function pointers
+  clGetGLContextInfoKHR_ptr =
+      (clGetGLContextInfoKHR_fn)clGetExtensionFunctionAddress(
+          "clGetGLContextInfoKHR");
+  clCreateFromGLBuffer_ptr =
+      (clCreateFromGLBuffer_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLBuffer");
+  clCreateFromGLTexture_ptr =
+      (clCreateFromGLTexture_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLTexture");
+  clCreateFromGLTexture2D_ptr =
+      (clCreateFromGLTexture2D_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLTexture2D");
+  clCreateFromGLRenderbuffer_ptr =
+      (clCreateFromGLRenderbuffer_fn)clGetExtensionFunctionAddress(
+          "clCreateFromGLRenderbuffer");
+  clGetGLObjectInfo_ptr =
+      (clGetGLObjectInfo_fn)clGetExtensionFunctionAddress("clGetGLObjectInfo");
+  clGetGLTextureInfo_ptr = (clGetGLTextureInfo_fn)clGetExtensionFunctionAddress(
+      "clGetGLTextureInfo");
+  clEnqueueAcquireGLObjects_ptr =
+      (clEnqueueAcquireGLObjects_fn)clGetExtensionFunctionAddress(
+          "clEnqueueAcquireGLObjects");
+  clEnqueueReleaseGLObjects_ptr =
+      (clEnqueueReleaseGLObjects_fn)clGetExtensionFunctionAddress(
+          "clEnqueueReleaseGLObjects");
+
+  // Performance counter function pointers
+  clCreatePerfCounterAMD_ptr =
+      (clCreatePerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clCreatePerfCounterAMD");
+  clEnqueueBeginPerfCounterAMD_ptr =
+      (clEnqueueBeginPerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueBeginPerfCounterAMD");
+  clEnqueueEndPerfCounterAMD_ptr =
+      (clEnqueueEndPerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clEnqueueEndPerfCounterAMD");
+  clGetPerfCounterInfoAMD_ptr =
+      (clGetPerfCounterInfoAMD_fn)clGetExtensionFunctionAddress(
+          "clGetPerfCounterInfoAMD");
+  clReleasePerfCounterAMD_ptr =
+      (clReleasePerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clReleasePerfCounterAMD");
+  clRetainPerfCounterAMD_ptr =
+      (clRetainPerfCounterAMD_fn)clGetExtensionFunctionAddress(
+          "clRetainPerfCounterAMD");
+  clSetDeviceClockModeAMD_ptr =
+      (clSetDeviceClockModeAMD_fn)clGetExtensionFunctionAddress(
+          "clSetDeviceClockModeAMD");
+}
+
+// Pass-through to the global ::clGetPlatformIDs; arguments and the
+// returned status are forwarded unchanged.
+cl_int OCLWrapper::clGetPlatformIDs(cl_uint num_entries,
+                                    cl_platform_id *platforms,
+                                    cl_uint *num_platforms) {
+  const cl_int status =
+      ::clGetPlatformIDs(num_entries, platforms, num_platforms);
+  return status;
+}
+
+// Pass-through to the global ::clGetPlatformInfo.
+cl_int OCLWrapper::clGetPlatformInfo(cl_platform_id platform,
+                                     cl_platform_info param_name,
+                                     size_t param_value_size, void *param_value,
+                                     size_t *param_value_size_ret) {
+  const cl_int status = ::clGetPlatformInfo(
+      platform, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetDeviceIDs.
+cl_int OCLWrapper::clGetDeviceIDs(cl_platform_id platform,
+                                  cl_device_type device_type,
+                                  cl_uint num_entries, cl_device_id *devices,
+                                  cl_uint *num_devices) {
+  const cl_int status = ::clGetDeviceIDs(platform, device_type, num_entries,
+                                         devices, num_devices);
+  return status;
+}
+
+// Pass-through to the global ::clGetDeviceInfo.
+cl_int OCLWrapper::clGetDeviceInfo(cl_device_id device,
+                                   cl_device_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret) {
+  const cl_int status = ::clGetDeviceInfo(
+      device, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateContext; the created context (or
+// whatever the runtime returns) is handed back unchanged.
+cl_context OCLWrapper::clCreateContext(
+    cl_context_properties *properties, cl_uint num_devices,
+    const cl_device_id *devices,
+    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+    void *user_data, cl_int *errcode_ret) {
+  cl_context ctx = ::clCreateContext(properties, num_devices, devices,
+                                     pfn_notify, user_data, errcode_ret);
+  return ctx;
+}
+
+// Pass-through to the global ::clCreateContextFromType.
+cl_context OCLWrapper::clCreateContextFromType(
+    cl_context_properties *properties, cl_device_type device_type,
+    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+    void *user_data, cl_int *errcode_ret) {
+  cl_context ctx = ::clCreateContextFromType(properties, device_type,
+                                             pfn_notify, user_data,
+                                             errcode_ret);
+  return ctx;
+}
+
+// Pass-through to the global ::clRetainContext.
+cl_int OCLWrapper::clRetainContext(cl_context context) {
+  const cl_int status = ::clRetainContext(context);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseContext.
+cl_int OCLWrapper::clReleaseContext(cl_context context) {
+  const cl_int status = ::clReleaseContext(context);
+  return status;
+}
+
+// Pass-through to the global ::clGetContextInfo.
+cl_int OCLWrapper::clGetContextInfo(cl_context context,
+                                    cl_context_info param_name,
+                                    size_t param_value_size, void *param_value,
+                                    size_t *param_value_size_ret) {
+  const cl_int status = ::clGetContextInfo(
+      context, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Create a command queue for `device` in `context`.
+//
+// When built against OpenCL 2.0+ headers, the platform that owns `device`
+// is queried for CL_PLATFORM_VERSION. If the platform reports a 1.x
+// version the legacy ::clCreateCommandQueue is used; otherwise (including
+// when any of the queries fail) `properties` is wrapped into a
+// zero-terminated list for ::clCreateCommandQueueWithProperties, or NULL is
+// passed when `properties` is 0. Without 2.0 headers the legacy entry point
+// is always used. The queue and *errcode_ret come straight from the runtime.
+cl_command_queue OCLWrapper::clCreateCommandQueue(
+    cl_context context, cl_device_id device,
+    cl_command_queue_properties properties, cl_int *errcode_ret) {
+#if defined(CL_VERSION_2_0)
+  // CL_PLATFORM_VERSION has the form "OpenCL <major>.<minor> <info>", so
+  // the major version digit is at index 7 (index 8 is the '.').
+  const size_t kMajorDigitIndex = 7;
+
+  bool version20 = true;
+  cl_platform_id pid;
+  cl_int err = ::clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
+                                 sizeof(cl_platform_id), &pid, NULL);
+  if (err == CL_SUCCESS) {
+    size_t size = 0;
+    err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, 0, NULL, &size);
+    // Only inspect the string when it is long enough to hold the digit.
+    if (err == CL_SUCCESS && size > kMajorDigitIndex) {
+      char *ver = new char[size];
+      err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, size, ver, NULL);
+      if (err == CL_SUCCESS && ver[kMajorDigitIndex] == '1') {
+        version20 = false;
+      }
+      delete[] ver;
+    }
+  }
+
+  if (!version20) {
+    return ::clCreateCommandQueue(context, device, properties, errcode_ret);
+  }
+
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0};
+  return ::clCreateCommandQueueWithProperties(
+      context, device, properties ? cprops : NULL, errcode_ret);
+#else
+  return ::clCreateCommandQueue(context, device, properties, errcode_ret);
+#endif
+}
+
+// Pass-through to the global ::clRetainCommandQueue.
+cl_int OCLWrapper::clRetainCommandQueue(cl_command_queue command_queue) {
+  const cl_int status = ::clRetainCommandQueue(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseCommandQueue.
+cl_int OCLWrapper::clReleaseCommandQueue(cl_command_queue command_queue) {
+  const cl_int status = ::clReleaseCommandQueue(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clGetCommandQueueInfo.
+cl_int OCLWrapper::clGetCommandQueueInfo(cl_command_queue command_queue,
+                                         cl_command_queue_info param_name,
+                                         size_t param_value_size,
+                                         void *param_value,
+                                         size_t *param_value_size_ret) {
+  const cl_int status = ::clGetCommandQueueInfo(
+      command_queue, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateBuffer.
+cl_mem OCLWrapper::clCreateBuffer(cl_context context, cl_mem_flags flags,
+                                  size_t size, void *host_ptr,
+                                  cl_int *errcode_ret) {
+  cl_mem buffer = ::clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
+  return buffer;
+}
+
+// Pass-through to the global ::clCreateImage2D.
+cl_mem OCLWrapper::clCreateImage2D(cl_context context, cl_mem_flags flags,
+                                   const cl_image_format *image_format,
+                                   size_t image_width, size_t image_height,
+                                   size_t image_row_pitch, void *host_ptr,
+                                   cl_int *errcode_ret) {
+  cl_mem image =
+      ::clCreateImage2D(context, flags, image_format, image_width,
+                        image_height, image_row_pitch, host_ptr, errcode_ret);
+  return image;
+}
+
+// Pass-through to the global ::clCreateImage3D.
+cl_mem OCLWrapper::clCreateImage3D(cl_context context, cl_mem_flags flags,
+                                   const cl_image_format *image_format,
+                                   size_t image_width, size_t image_height,
+                                   size_t image_depth, size_t image_row_pitch,
+                                   size_t image_slice_pitch, void *host_ptr,
+                                   cl_int *errcode_ret) {
+  cl_mem image = ::clCreateImage3D(
+      context, flags, image_format, image_width, image_height, image_depth,
+      image_row_pitch, image_slice_pitch, host_ptr, errcode_ret);
+  return image;
+}
+
+// Pass-through to the global ::clRetainMemObject.
+cl_int OCLWrapper::clRetainMemObject(cl_mem memobj) {
+  const cl_int status = ::clRetainMemObject(memobj);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseMemObject.
+cl_int OCLWrapper::clReleaseMemObject(cl_mem memobj) {
+  const cl_int status = ::clReleaseMemObject(memobj);
+  return status;
+}
+
+// Pass-through to the global ::clGetSupportedImageFormats.
+cl_int OCLWrapper::clGetSupportedImageFormats(cl_context context,
+                                              cl_mem_flags flags,
+                                              cl_mem_object_type image_type,
+                                              cl_uint num_entries,
+                                              cl_image_format *image_formats,
+                                              cl_uint *num_image_formats) {
+  const cl_int status = ::clGetSupportedImageFormats(
+      context, flags, image_type, num_entries, image_formats,
+      num_image_formats);
+  return status;
+}
+
+// Pass-through to the global ::clGetMemObjectInfo.
+cl_int OCLWrapper::clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name,
+                                      size_t param_value_size,
+                                      void *param_value,
+                                      size_t *param_value_size_ret) {
+  const cl_int status = ::clGetMemObjectInfo(
+      memobj, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetImageInfo.
+cl_int OCLWrapper::clGetImageInfo(cl_mem image, cl_image_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret) {
+  const cl_int status = ::clGetImageInfo(
+      image, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Create a sampler from the three legacy sampler arguments.
+//
+// When built against OpenCL 2.0+ headers the arguments are packed into a
+// zero-terminated property list and ::clCreateSamplerWithProperties is
+// called; otherwise the call goes to ::clCreateSampler directly.
+cl_sampler OCLWrapper::clCreateSampler(cl_context context,
+                                       cl_bool normalized_coords,
+                                       cl_addressing_mode addressing_mode,
+                                       cl_filter_mode filter_mode,
+                                       cl_int *errcode_ret) {
+#ifndef CL_VERSION_2_0
+  return ::clCreateSampler(context, normalized_coords, addressing_mode,
+                           filter_mode, errcode_ret);
+#else
+  // Key/value pairs followed by the terminating 0.
+  const cl_sampler_properties samplerProps[] = {
+      CL_SAMPLER_NORMALIZED_COORDS,
+      static_cast<cl_sampler_properties>(normalized_coords),
+      CL_SAMPLER_ADDRESSING_MODE,
+      static_cast<cl_sampler_properties>(addressing_mode),
+      CL_SAMPLER_FILTER_MODE,
+      static_cast<cl_sampler_properties>(filter_mode),
+      0};
+  cl_sampler sampler =
+      ::clCreateSamplerWithProperties(context, samplerProps, errcode_ret);
+  return sampler;
+#endif
+}
+
+// Pass-through to the global ::clRetainSampler.
+cl_int OCLWrapper::clRetainSampler(cl_sampler sampler) {
+  const cl_int status = ::clRetainSampler(sampler);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseSampler.
+cl_int OCLWrapper::clReleaseSampler(cl_sampler sampler) {
+  const cl_int status = ::clReleaseSampler(sampler);
+  return status;
+}
+
+// Pass-through to the global ::clGetSamplerInfo.
+cl_int OCLWrapper::clGetSamplerInfo(cl_sampler sampler,
+                                    cl_sampler_info param_name,
+                                    size_t param_value_size, void *param_value,
+                                    size_t *param_value_size_ret) {
+  const cl_int status = ::clGetSamplerInfo(
+      sampler, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateProgramWithSource.
+cl_program OCLWrapper::clCreateProgramWithSource(cl_context context,
+                                                 cl_uint count,
+                                                 const char **strings,
+                                                 const size_t *lengths,
+                                                 cl_int *errcode_ret) {
+  cl_program program = ::clCreateProgramWithSource(context, count, strings,
+                                                   lengths, errcode_ret);
+  return program;
+}
+
+// Pass-through to the global ::clCreateProgramWithBinary.
+cl_program OCLWrapper::clCreateProgramWithBinary(
+    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+    const size_t *lengths, const unsigned char **binaries,
+    cl_int *binary_status, cl_int *errcode_ret) {
+  cl_program program = ::clCreateProgramWithBinary(
+      context, num_devices, device_list, lengths, binaries, binary_status,
+      errcode_ret);
+  return program;
+}
+
+// Pass-through to the global ::clRetainProgram.
+cl_int OCLWrapper::clRetainProgram(cl_program program) {
+  const cl_int status = ::clRetainProgram(program);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseProgram.
+cl_int OCLWrapper::clReleaseProgram(cl_program program) {
+  const cl_int status = ::clReleaseProgram(program);
+  return status;
+}
+
+// Pass-through to the global ::clBuildProgram.
+cl_int OCLWrapper::clBuildProgram(
+    cl_program program, cl_uint num_devices, const cl_device_id *device_list,
+    const char *options,
+    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+    void *user_data) {
+  const cl_int status = ::clBuildProgram(program, num_devices, device_list,
+                                         options, pfn_notify, user_data);
+  return status;
+}
+
+// Pass-through to the global ::clCompileProgram.
+cl_int OCLWrapper::clCompileProgram(
+    cl_program program, cl_uint num_devices, const cl_device_id *device_list,
+    const char *options, cl_uint num_input_headers,
+    const cl_program *input_headers, const char **header_include_names,
+    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+    void *user_data) {
+  const cl_int status = ::clCompileProgram(
+      program, num_devices, device_list, options, num_input_headers,
+      input_headers, header_include_names, pfn_notify, user_data);
+  return status;
+}
+
+// Pass-through to the global ::clLinkProgram.
+cl_program OCLWrapper::clLinkProgram(
+    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+    const char *options, cl_uint num_input_programs,
+    const cl_program *input_programs,
+    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+    void *user_data, cl_int *errcode_ret) {
+  cl_program linked = ::clLinkProgram(
+      context, num_devices, device_list, options, num_input_programs,
+      input_programs, pfn_notify, user_data, errcode_ret);
+  return linked;
+}
+
+// Pass-through to the global ::clUnloadCompiler.
+cl_int OCLWrapper::clUnloadCompiler(void) {
+  const cl_int status = ::clUnloadCompiler();
+  return status;
+}
+
+// Pass-through to the global ::clGetProgramInfo.
+cl_int OCLWrapper::clGetProgramInfo(cl_program program,
+                                    cl_program_info param_name,
+                                    size_t param_value_size, void *param_value,
+                                    size_t *param_value_size_ret) {
+  const cl_int status = ::clGetProgramInfo(
+      program, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetProgramBuildInfo.
+cl_int OCLWrapper::clGetProgramBuildInfo(
+    cl_program program, cl_device_id device, cl_program_build_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
+  const cl_int status = ::clGetProgramBuildInfo(
+      program, device, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clCreateKernel.
+cl_kernel OCLWrapper::clCreateKernel(cl_program program,
+                                     const char *kernel_name,
+                                     cl_int *errcode_ret) {
+  cl_kernel kernel = ::clCreateKernel(program, kernel_name, errcode_ret);
+  return kernel;
+}
+
+// Pass-through to the global ::clCreateKernelsInProgram.
+cl_int OCLWrapper::clCreateKernelsInProgram(cl_program program,
+                                            cl_uint num_kernels,
+                                            cl_kernel *kernels,
+                                            cl_uint *num_kernels_ret) {
+  const cl_int status = ::clCreateKernelsInProgram(program, num_kernels,
+                                                   kernels, num_kernels_ret);
+  return status;
+}
+
+// Pass-through to the global ::clRetainKernel.
+cl_int OCLWrapper::clRetainKernel(cl_kernel kernel) {
+  const cl_int status = ::clRetainKernel(kernel);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseKernel.
+cl_int OCLWrapper::clReleaseKernel(cl_kernel kernel) {
+  const cl_int status = ::clReleaseKernel(kernel);
+  return status;
+}
+
+// Pass-through to the global ::clSetKernelArg.
+cl_int OCLWrapper::clSetKernelArg(cl_kernel kernel, cl_uint arg_index,
+                                  size_t arg_size, const void *arg_value) {
+  const cl_int status =
+      ::clSetKernelArg(kernel, arg_index, arg_size, arg_value);
+  return status;
+}
+
+// Pass-through to the global ::clGetKernelInfo.
+cl_int OCLWrapper::clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret) {
+  const cl_int status = ::clGetKernelInfo(
+      kernel, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clGetKernelWorkGroupInfo.
+cl_int OCLWrapper::clGetKernelWorkGroupInfo(
+    cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
+  const cl_int status = ::clGetKernelWorkGroupInfo(
+      kernel, device, param_name, param_value_size, param_value,
+      param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clWaitForEvents.
+cl_int OCLWrapper::clWaitForEvents(cl_uint num_events,
+                                   const cl_event *event_list) {
+  const cl_int status = ::clWaitForEvents(num_events, event_list);
+  return status;
+}
+
+// Pass-through to the global ::clGetEventInfo.
+cl_int OCLWrapper::clGetEventInfo(cl_event evnt, cl_event_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret) {
+  const cl_int status = ::clGetEventInfo(
+      evnt, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clRetainEvent.
+cl_int OCLWrapper::clRetainEvent(cl_event evnt) {
+  const cl_int status = ::clRetainEvent(evnt);
+  return status;
+}
+
+// Pass-through to the global ::clReleaseEvent.
+cl_int OCLWrapper::clReleaseEvent(cl_event evnt) {
+  const cl_int status = ::clReleaseEvent(evnt);
+  return status;
+}
+
+// Pass-through to the global ::clGetEventProfilingInfo.
+cl_int OCLWrapper::clGetEventProfilingInfo(cl_event evnt,
+                                           cl_profiling_info param_name,
+                                           size_t param_value_size,
+                                           void *param_value,
+                                           size_t *param_value_size_ret) {
+  const cl_int status = ::clGetEventProfilingInfo(
+      evnt, param_name, param_value_size, param_value, param_value_size_ret);
+  return status;
+}
+
+// Pass-through to the global ::clFlush.
+cl_int OCLWrapper::clFlush(cl_command_queue command_queue) {
+  const cl_int status = ::clFlush(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clFinish.
+cl_int OCLWrapper::clFinish(cl_command_queue command_queue) {
+  const cl_int status = ::clFinish(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueReadBuffer.
+cl_int OCLWrapper::clEnqueueReadBuffer(cl_command_queue command_queue,
+                                       cl_mem buffer, cl_bool blocking_read,
+                                       size_t offset, size_t cb, void *ptr,
+                                       cl_uint num_events_in_wait_list,
+                                       const cl_event *event_wait_list,
+                                       cl_event *evnt) {
+  const cl_int status = ::clEnqueueReadBuffer(
+      command_queue, buffer, blocking_read, offset, cb, ptr,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWriteBuffer.
+cl_int OCLWrapper::clEnqueueWriteBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+    size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueWriteBuffer(
+      command_queue, buffer, blocking_write, offset, cb, ptr,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyBuffer.
+cl_int OCLWrapper::clEnqueueCopyBuffer(cl_command_queue command_queue,
+                                       cl_mem src_buffer, cl_mem dst_buffer,
+                                       size_t src_offset, size_t dst_offset,
+                                       size_t cb,
+                                       cl_uint num_events_in_wait_list,
+                                       const cl_event *event_wait_list,
+                                       cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyBuffer(
+      command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueReadBufferRect.
+cl_int OCLWrapper::clEnqueueReadBufferRect(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+    const size_t *buffer_origin, const size_t *host_origin,
+    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+    size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueReadBufferRect(
+      command_queue, buffer, blocking_read, buffer_origin, host_origin, region,
+      buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+      ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWriteBufferRect.
+cl_int OCLWrapper::clEnqueueWriteBufferRect(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+    const size_t *buffer_origin, const size_t *host_origin,
+    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
+    size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueWriteBufferRect(
+      command_queue, buffer, blocking_write, buffer_origin, host_origin, region,
+      buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+      ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyBufferRect.
+cl_int OCLWrapper::clEnqueueCopyBufferRect(
+    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
+    const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+    size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
+    size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyBufferRect(
+      command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+      src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueReadImage.
+cl_int OCLWrapper::clEnqueueReadImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
+    const size_t *origin, const size_t *region, size_t row_pitch,
+    size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueReadImage(
+      command_queue, image, blocking_read, origin, region, row_pitch,
+      slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWriteImage.
+cl_int OCLWrapper::clEnqueueWriteImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
+    const size_t *origin, const size_t *region, size_t input_row_pitch,
+    size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueWriteImage(
+      command_queue, image, blocking_write, origin, region, input_row_pitch,
+      input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyImage.
+cl_int OCLWrapper::clEnqueueCopyImage(
+    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
+    const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyImage(
+      command_queue, src_image, dst_image, src_origin, dst_origin, region,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyImageToBuffer.
+cl_int OCLWrapper::clEnqueueCopyImageToBuffer(
+    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer,
+    const size_t *src_origin, const size_t *region, size_t dst_offset,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyImageToBuffer(
+      command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueCopyBufferToImage.
+cl_int OCLWrapper::clEnqueueCopyBufferToImage(
+    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,
+    size_t src_offset, const size_t *dst_origin, const size_t *region,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt) {
+  const cl_int status = ::clEnqueueCopyBufferToImage(
+      command_queue, src_buffer, dst_image, src_offset, dst_origin, region,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMapBuffer; the pointer it returns
+// is handed back unchanged.
+void *OCLWrapper::clEnqueueMapBuffer(cl_command_queue command_queue,
+                                     cl_mem buffer, cl_bool blocking_map,
+                                     cl_map_flags map_flags, size_t offset,
+                                     size_t cb, cl_uint num_events_in_wait_list,
+                                     const cl_event *event_wait_list,
+                                     cl_event *evnt, cl_int *errcode_ret) {
+  void *mapped = ::clEnqueueMapBuffer(
+      command_queue, buffer, blocking_map, map_flags, offset, cb,
+      num_events_in_wait_list, event_wait_list, evnt, errcode_ret);
+  return mapped;
+}
+
+// Pass-through to the global ::clEnqueueMapImage; the pointer it returns
+// is handed back unchanged.
+void *OCLWrapper::clEnqueueMapImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
+    cl_map_flags map_flags, const size_t *origin, const size_t *region,
+    size_t *image_row_pitch, size_t *image_slice_pitch,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *evnt, cl_int *errcode_ret) {
+  void *mapped = ::clEnqueueMapImage(
+      command_queue, image, blocking_map, map_flags, origin, region,
+      image_row_pitch, image_slice_pitch, num_events_in_wait_list,
+      event_wait_list, evnt, errcode_ret);
+  return mapped;
+}
+
+// Pass-through to the global ::clEnqueueUnmapMemObject.
+cl_int OCLWrapper::clEnqueueUnmapMemObject(cl_command_queue command_queue,
+                                           cl_mem memobj, void *mapped_ptr,
+                                           cl_uint num_events_in_wait_list,
+                                           const cl_event *event_wait_list,
+                                           cl_event *evnt) {
+  const cl_int status = ::clEnqueueUnmapMemObject(
+      command_queue, memobj, mapped_ptr, num_events_in_wait_list,
+      event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueNDRangeKernel.
+cl_int OCLWrapper::clEnqueueNDRangeKernel(
+    cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
+    const size_t *global_work_offset, const size_t *global_work_size,
+    const size_t *local_work_size, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueNDRangeKernel(
+      command_queue, kernel, work_dim, global_work_offset, global_work_size,
+      local_work_size, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Enqueue `kernel` as a single-work-item task.
+//
+// When built against OpenCL 2.0+ headers this is expressed as a
+// one-dimensional ::clEnqueueNDRangeKernel call with global and local size
+// 1; otherwise ::clEnqueueTask is called directly.
+cl_int OCLWrapper::clEnqueueTask(cl_command_queue command_queue,
+                                 cl_kernel kernel,
+                                 cl_uint num_events_in_wait_list,
+                                 const cl_event *event_wait_list,
+                                 cl_event *evnt) {
+#if !defined(CL_VERSION_2_0)
+  return ::clEnqueueTask(command_queue, kernel, num_events_in_wait_list,
+                         event_wait_list, evnt);
+#else
+  // work_dim is 1 below, with a global and local size of 1 in dimension 0.
+  static const size_t singleItemGlobal[3] = {1, 0, 0};
+  static const size_t singleItemLocal[3] = {1, 0, 0};
+
+  const cl_int status = ::clEnqueueNDRangeKernel(
+      command_queue, kernel, 1, NULL, singleItemGlobal, singleItemLocal,
+      num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+#endif
+}
+
+// Pass-through to the global ::clEnqueueNativeKernel.
+cl_int OCLWrapper::clEnqueueNativeKernel(
+    cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),
+    void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,
+    const void **args_mem_loc, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  const cl_int status = ::clEnqueueNativeKernel(
+      command_queue, user_func, args, cb_args, num_mem_objects, mem_list,
+      args_mem_loc, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMarker.
+cl_int OCLWrapper::clEnqueueMarker(cl_command_queue command_queue,
+                                   cl_event *evnt) {
+  const cl_int status = ::clEnqueueMarker(command_queue, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueMarkerWithWaitList.
+cl_int OCLWrapper::clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+                                               cl_uint num_events_in_wait_list,
+                                               const cl_event *event_wait_list,
+                                               cl_event *evnt) {
+  const cl_int status = ::clEnqueueMarkerWithWaitList(
+      command_queue, num_events_in_wait_list, event_wait_list, evnt);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueWaitForEvents.
+cl_int OCLWrapper::clEnqueueWaitForEvents(cl_command_queue command_queue,
+                                          cl_uint num_events,
+                                          const cl_event *event_list) {
+  const cl_int status =
+      ::clEnqueueWaitForEvents(command_queue, num_events, event_list);
+  return status;
+}
+
+// Pass-through to the global ::clEnqueueBarrier.
+cl_int OCLWrapper::clEnqueueBarrier(cl_command_queue command_queue) {
+  const cl_int status = ::clEnqueueBarrier(command_queue);
+  return status;
+}
+
+// Pass-through to the global ::clGetExtensionFunctionAddress; returns the
+// address it reports for `func_name`.
+void *OCLWrapper::clGetExtensionFunctionAddress(const char *func_name) {
+  void *address = ::clGetExtensionFunctionAddress(func_name);
+  return address;
+}
+
+// Pass-through to the global ::clCreateImage.
+cl_mem OCLWrapper::clCreateImage(cl_context context, cl_mem_flags flags,
+                                 const cl_image_format *image_format,
+                                 const cl_image_desc *image_desc,
+                                 void *host_ptr, cl_int *errcode_ret) {
+  cl_mem image = ::clCreateImage(context, flags, image_format, image_desc,
+                                 host_ptr, errcode_ret);
+  return image;
+}
+
+cl_mem OCLWrapper::clCreateSubBuffer(cl_mem mem, cl_mem_flags flags,
+                                     cl_buffer_create_type buffer_create_type,
+                                     const void *buffer_create_info,
+                                     cl_int *errcode_ret) {
+  return ::clCreateSubBuffer(mem, flags, buffer_create_type, buffer_create_info,
+                             errcode_ret);
+}  // forwards unchanged to the global ::clCreateSubBuffer
+
+cl_int OCLWrapper::clSetEventCallback(
+    cl_event event, cl_int command_exec_callback_type,
+    void(CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                        cl_int event_command_exec_status,
+                                        void *user_data),
+    void *user_data) {
+  return ::clSetEventCallback(event, command_exec_callback_type,
+                              pfn_event_notify, user_data);
+}  // forwards unchanged to the global ::clSetEventCallback
+
+cl_int OCLWrapper::clEnqueueFillImage(
+    cl_command_queue command_queue, cl_mem image, void *ptr,
+    const size_t *origin, const size_t *region, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *evnt) {
+  return ::clEnqueueFillImage(command_queue, image, ptr, origin, region,
+                              num_events_in_wait_list, event_wait_list, evnt);
+}  // forwards unchanged to the global ::clEnqueueFillImage
+
+cl_int OCLWrapper::clUnloadPlatformAMD(cl_platform_id id) {
+  if (clUnloadPlatformAMD_ptr) return clUnloadPlatformAMD_ptr(id);
+  return CL_SUCCESS;  // pointer is NULL: nothing is called, success reported
+}
+cl_int OCLWrapper::clEnqueueWaitSignalAMD(cl_command_queue command_queue,
+                                          cl_mem mem_object, cl_uint value,
+                                          cl_uint num_events,
+                                          const cl_event *event_wait_list,
+                                          cl_event *event) {
+  return clEnqueueWaitSignalAMD_ptr(command_queue, mem_object, value,
+                                    num_events, event_wait_list, event);
+}  // NOTE(review): _ptr is called unchecked, unlike clUnloadPlatformAMD
+
+cl_int OCLWrapper::clEnqueueWriteSignalAMD(cl_command_queue command_queue,
+                                           cl_mem mem_object, cl_uint value,
+                                           cl_ulong offset, cl_uint num_events,
+                                           const cl_event *event_list,
+                                           cl_event *event) {
+  return clEnqueueWriteSignalAMD_ptr(command_queue, mem_object, value, offset,
+                                     num_events, event_list, event);
+}  // NOTE(review): _ptr is called unchecked, unlike clUnloadPlatformAMD
+
+cl_int OCLWrapper::clEnqueueMakeBuffersResidentAMD(
+    cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects,
+    cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses,
+    cl_uint num_events, const cl_event *event_list, cl_event *event) {
+  return clEnqueueMakeBuffersResidentAMD_ptr(
+      command_queue, num_mem_objs, mem_objects, blocking_make_resident,
+      bus_addresses, num_events, event_list, event);
+}  // NOTE(review): _ptr is called unchecked, unlike clUnloadPlatformAMD
+
+cl_int OCLWrapper::clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                                              cl_uint num_mem_objects,
+                                              const cl_mem *mem_objects,
+                                              cl_mem_migration_flags flags,
+                                              cl_uint num_events_in_wait_list,
+                                              const cl_event *event_wait_list,
+                                              cl_event *event) {
+  return ::clEnqueueMigrateMemObjects(
+      command_queue, num_mem_objects, mem_objects, flags,
+      num_events_in_wait_list, event_wait_list, event);
+}  // forwards unchanged to the global ::clEnqueueMigrateMemObjects
+
+cl_int OCLWrapper::clGetGLContextInfoKHR(
+    const cl_context_properties *properties, cl_gl_context_info param_name,
+    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
+  return (*clGetGLContextInfoKHR_ptr)(properties, param_name, param_value_size,
+                                      param_value, param_value_size_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_mem OCLWrapper::clCreateFromGLBuffer(cl_context context, cl_mem_flags flags,
+                                        unsigned int bufobj, int *errcode_ret) {
+  return (*clCreateFromGLBuffer_ptr)(context, flags, bufobj, errcode_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_mem OCLWrapper::clCreateFromGLTexture(cl_context context, cl_mem_flags flags,
+                                         unsigned int texture_target,
+                                         int miplevel, unsigned int texture,
+                                         cl_int *errcode_ret) {
+  return (*clCreateFromGLTexture_ptr)(context, flags, texture_target, miplevel,
+                                      texture, errcode_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_mem OCLWrapper::clCreateFromGLTexture2D(cl_context context,
+                                           cl_mem_flags flags,
+                                           unsigned int texture_target,
+                                           int miplevel, unsigned int texture,
+                                           cl_int *errcode_ret) {
+  return (*clCreateFromGLTexture2D_ptr)(context, flags, texture_target,
+                                        miplevel, texture, errcode_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_mem OCLWrapper::clCreateFromGLRenderbuffer(cl_context context,
+                                              cl_mem_flags flags,
+                                              unsigned int renderbuffer,
+                                              cl_int *errcode_ret) {
+  return (*clCreateFromGLRenderbuffer_ptr)(context, flags, renderbuffer,
+                                           errcode_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clGetGLObjectInfo(cl_mem memobj,
+                                     cl_gl_object_type *gl_object_type,
+                                     unsigned int *gl_object_name) {
+  return (*clGetGLObjectInfo_ptr)(memobj, gl_object_type, gl_object_name);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clGetGLTextureInfo(cl_mem memobj,
+                                      cl_gl_texture_info param_name,
+                                      size_t param_value_size,
+                                      void *param_value,
+                                      size_t *param_value_size_ret) {
+  return (*clGetGLTextureInfo_ptr)(memobj, param_name, param_value_size,
+                                   param_value, param_value_size_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clEnqueueAcquireGLObjects(cl_command_queue command_queue,
+                                             cl_uint num_objects,
+                                             const cl_mem *mem_objects,
+                                             cl_uint num_events_in_wait_list,
+                                             const cl_event *event_wait_list,
+                                             cl_event *event) {
+  return (*clEnqueueAcquireGLObjects_ptr)(command_queue, num_objects,
+                                          mem_objects, num_events_in_wait_list,
+                                          event_wait_list, event);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clEnqueueReleaseGLObjects(cl_command_queue command_queue,
+                                             cl_uint num_objects,
+                                             const cl_mem *mem_objects,
+                                             cl_uint num_events_in_wait_list,
+                                             const cl_event *event_wait_list,
+                                             cl_event *event) {
+  return (*clEnqueueReleaseGLObjects_ptr)(command_queue, num_objects,
+                                          mem_objects, num_events_in_wait_list,
+                                          event_wait_list, event);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+#if defined(CL_VERSION_2_0)
+cl_command_queue OCLWrapper::clCreateCommandQueueWithProperties(
+    cl_context context, cl_device_id device,
+    const cl_queue_properties *properties, cl_int *errcode_ret) {
+  return ::clCreateCommandQueueWithProperties(context, device, properties,
+                                              errcode_ret);
+}  // forwards unchanged to the global ::clCreateCommandQueueWithProperties
+
+void *OCLWrapper::clSVMAlloc(cl_context context, cl_svm_mem_flags flags,
+                             size_t size, cl_uint alignment) {
+  return ::clSVMAlloc(context, flags, size, alignment);
+}  // forwards unchanged to the global ::clSVMAlloc
+
+void OCLWrapper::clSVMFree(cl_context context, void *svm_pointer) {
+  return ::clSVMFree(context, svm_pointer);
+}  // forwards unchanged to the global ::clSVMFree (no value is returned)
+
+cl_int OCLWrapper::clEnqueueSVMMap(cl_command_queue command_queue,
+                                   cl_bool blocking_map, cl_map_flags flags,
+                                   void *svm_ptr, size_t size,
+                                   cl_uint num_events_in_wait_list,
+                                   const cl_event *event_wait_list,
+                                   cl_event *event) {
+  return ::clEnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size,
+                           num_events_in_wait_list, event_wait_list, event);
+}  // forwards unchanged to the global ::clEnqueueSVMMap
+
+cl_int OCLWrapper::clEnqueueSVMUnmap(cl_command_queue command_queue,
+                                     void *svm_ptr,
+                                     cl_uint num_events_in_wait_list,
+                                     const cl_event *event_wait_list,
+                                     cl_event *event) {
+  return ::clEnqueueSVMUnmap(command_queue, svm_ptr, num_events_in_wait_list,
+                             event_wait_list, event);
+}  // forwards unchanged to the global ::clEnqueueSVMUnmap
+cl_int OCLWrapper::clEnqueueSVMMemFill(cl_command_queue command_queue,
+                                       void *svm_ptr, const void *pattern,
+                                       size_t pattern_size, size_t size,
+                                       cl_uint num_events_in_wait_list,
+                                       const cl_event *event_wait_list,
+                                       cl_event *event) {
+  return ::clEnqueueSVMMemFill(command_queue, svm_ptr, pattern, pattern_size,
+                               size, num_events_in_wait_list, event_wait_list,
+                               event);
+}  // forwards unchanged to the global ::clEnqueueSVMMemFill
+
+cl_int OCLWrapper::clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index,
+                                            const void *arg_value) {
+  return ::clSetKernelArgSVMPointer(kernel, arg_index, arg_value);
+}  // forwards unchanged to the global ::clSetKernelArgSVMPointer
+
+cl_mem OCLWrapper::clCreatePipe(cl_context context, cl_mem_flags flags,
+                                cl_uint packet_size, cl_uint pipe_max_packets,
+                                const cl_pipe_properties *properties,
+                                cl_int *errcode_ret) {
+  return ::clCreatePipe(context, flags, packet_size, pipe_max_packets,
+                        properties, errcode_ret);
+}  // forwards unchanged to the global ::clCreatePipe
+
+cl_int OCLWrapper::clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret) {
+  return ::clGetPipeInfo(pipe, param_name, param_value_size, param_value,
+                         param_value_size_ret);
+}  // forwards unchanged to the global ::clGetPipeInfo
+
+#endif
+
+cl_perfcounter_amd OCLWrapper::clCreatePerfCounterAMD(
+    cl_device_id device, cl_perfcounter_property *properties,
+    cl_int *errcode_ret) {
+  return (*clCreatePerfCounterAMD_ptr)(device, properties, errcode_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clEnqueueBeginPerfCounterAMD(
+    cl_command_queue command_queue, cl_uint num_perf_counters,
+    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list, cl_event *event) {
+  return (*clEnqueueBeginPerfCounterAMD_ptr)(
+      command_queue, num_perf_counters, perf_counters, num_events_in_wait_list,
+      event_wait_list, event);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clEnqueueEndPerfCounterAMD(cl_command_queue command_queue,
+                                              cl_uint num_perf_counters,
+                                              cl_perfcounter_amd *perf_counters,
+                                              cl_uint num_events_in_wait_list,
+                                              const cl_event *event_wait_list,
+                                              cl_event *event) {
+  return (*clEnqueueEndPerfCounterAMD_ptr)(
+      command_queue, num_perf_counters, perf_counters, num_events_in_wait_list,
+      event_wait_list, event);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter,
+                                           cl_perfcounter_info param_name,
+                                           size_t param_value_size,
+                                           void *param_value,
+                                           size_t *param_value_size_ret) {
+  return (*clGetPerfCounterInfoAMD_ptr)(perf_counter, param_name,
+                                        param_value_size, param_value,
+                                        param_value_size_ret);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter) {
+  return (*clReleasePerfCounterAMD_ptr)(perf_counter);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter) {
+  return (*clRetainPerfCounterAMD_ptr)(perf_counter);
+}  // calls through the stored pointer; NOTE(review): no NULL check
+
+cl_int OCLWrapper::clSetDeviceClockModeAMD(
+    cl_device_id device,
+    cl_set_device_clock_mode_input_amd set_clock_mode_input,
+    cl_set_device_clock_mode_output_amd *set_clock_mode_output) {
+  return (*clSetDeviceClockModeAMD_ptr)(device, set_clock_mode_input,
+                                        set_clock_mode_output);
+}  // calls through the stored pointer; NOTE(review): no NULL check
diff --git a/projects/clr/opencl/tests/ocltst/module/common/Timer.cpp b/projects/clr/opencl/tests/ocltst/module/common/Timer.cpp
new file mode 100644
index 0000000000..4ee095085f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/Timer.cpp
@@ -0,0 +1,112 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "Timer.h"
+
+#ifdef ATI_OS_WIN
+#include <windows.h>
+#endif
+
+#ifdef ATI_OS_LINUX
+#include <time.h>
+#define NANOSECONDS_PER_SEC 1000000000
+#endif
+
+// Zeroes the accumulated and in-flight tick values and records the tick rate:
+// QueryPerformanceFrequency() on Windows, nanoseconds-per-second on Linux.
+CPerfCounter::CPerfCounter() : _clocks(0), _start(0) {
+#if defined(ATI_OS_WIN)
+  QueryPerformanceFrequency((LARGE_INTEGER *)&_freq);
+#endif
+#if defined(ATI_OS_LINUX)
+  // Linux timestamps are kept in nanoseconds, so the rate is a constant.
+  _freq = NANOSECONDS_PER_SEC;
+#endif
+}
+
+CPerfCounter::~CPerfCounter() {
+  // Nothing to release: the counter holds only plain integer state.
+}
+
+// Records the current tick count as the start of a timed interval.
+void CPerfCounter::Start(void) {
+#ifdef ATI_OS_WIN
+  // A non-zero _start means Start() was already called without a Stop().
+  if (_start) {
+    MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK);
+    exit(1);  // usage error: terminate with a failure status, not success
+  }
+  QueryPerformanceCounter((LARGE_INTEGER *)&_start);
+#endif
+#ifdef ATI_OS_LINUX
+  // Monotonic clock reading, flattened to a single nanosecond count.
+  struct timespec now;
+  clock_gettime(CLOCK_MONOTONIC, &now);
+  _start = (i64)now.tv_sec * NANOSECONDS_PER_SEC + (i64)now.tv_nsec;
+
+#endif
+}
+
+// Ends the interval opened by Start() and adds its length to the running
+// total. On Windows, Stop() without a preceding Start() is fatal.
+void CPerfCounter::Stop(void) {
+  i64 now = 0;
+
+#ifdef ATI_OS_WIN
+  if (!_start) {
+    MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK);
+    exit(1);  // usage error: terminate with a failure status, not success
+  }
+  QueryPerformanceCounter((LARGE_INTEGER *)&now);
+#endif
+
+#ifdef ATI_OS_LINUX
+  // NOTE(review): no matching _start check here; Stop() without Start() adds
+  // the raw monotonic timestamp to the total.
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  now = (i64)ts.tv_sec * NANOSECONDS_PER_SEC + (i64)ts.tv_nsec;
+#endif
+
+  // Accumulate the elapsed ticks and mark the counter as idle again.
+  _clocks += now - _start;
+  _start = 0;
+}
+
+void CPerfCounter::Reset(void) {  // clears the accumulated total
+#ifdef ATI_OS_WIN
+  if (_start) {  // an interval is still open: resetting now is a usage error
+    MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK);
+    exit(1);  // terminate with a failure status, not success
+  }
+#endif
+  _clocks = 0;
+}
+
+double CPerfCounter::GetElapsedTime(void) {
+#ifdef ATI_OS_WIN
+  if (_start) {  // an interval is still open: the total is not final yet
+    MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK);
+    exit(1);  // terminate with a failure status, not success
+  }
+#endif
+  // Accumulated ticks divided by ticks-per-second gives seconds.
+  return (double)_clocks / (double)_freq;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/common/Timer.h b/projects/clr/opencl/tests/ocltst/module/common/Timer.h
new file mode 100644
index 0000000000..fd56fe3b0d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/common/Timer.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+#ifdef ATI_OS_WIN
+typedef __int64 i64;  // tick type used by CPerfCounter (Windows build)
+#endif
+#ifdef ATI_OS_LINUX
+typedef long long i64;  // NOTE(review): i64 is undefined on other platforms
+#endif
+
+class CPerfCounter {  // accumulating stopwatch; clocks are chosen in Timer.cpp
+ public:
+  CPerfCounter();
+  ~CPerfCounter();
+  void Start(void);             // begin a timed interval
+  void Stop(void);              // end it and add its length to the total
+  void Reset(void);             // clear the accumulated total
+  double GetElapsedTime(void);  // accumulated total, in seconds
+
+ private:
+  i64 _freq;    // ticks per second of the platform clock
+  i64 _clocks;  // ticks accumulated over all Start()/Stop() pairs
+  i64 _start;   // tick value taken by Start(); 0 while the counter is idle
+};
+
+#endif  // _TIMER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.cpp b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.cpp
new file mode 100644
index 0000000000..1219157d44
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.cpp
@@ -0,0 +1,236 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDX11Common.h"
+
+#define D3D_FEATURE_LEVEL_11_1 0xb100
+
+// Resolves extension entry point `x` into the member of the same name. On
+// failure it flags the test as failed, prints and stores a message, and
+// returns from the enclosing function (which must therefore return void).
+#define INITPFN(x)                                                             \
+  x = (x##_fn)clGetExtensionFunctionAddressForPlatform(platform_, #x);         \
+  if ((x) == NULL) {                                                           \
+    _errorFlag = true;                                                         \
+    printf("%s:%d - %s\n", __FILE__, __LINE__,                                 \
+           "Failed to get function pointer for " #x);                          \
+    _errorMsg = std::string("Failed to get function pointer for " #x);         \
+    _crcword += 1;                                                             \
+    return;                                                                    \
+  }
+
+// Starts every member in a known state (declaration order), so close() and
+// open() never read an indeterminate flag, COM pointer, queue or entry point.
+OCLDX11Common::OCLDX11Common()
+    : OCLTestImp(), extensionsAvailable(false), dxD3D11Device(NULL),
+      dxD3D11Context(NULL), dxDX11Texture(NULL), _queue(0),
+      clGetDeviceIDsFromD3D11KHR(NULL), clCreateFromD3D11BufferKHR(NULL),
+      clCreateFromD3D11Texture2DKHR(NULL), clCreateFromD3D11Texture3DKHR(NULL),
+      clEnqueueAcquireD3D11ObjectsKHR(NULL),
+      clEnqueueReleaseD3D11ObjectsKHR(NULL), clGetPlaneFromImageAMD(NULL) {}
+
+OCLDX11Common::~OCLDX11Common() {}  // queue and D3D objects go in close()
+
+// Sets extensionsAvailable; it stays false unless every check completes.
+void OCLDX11Common::ExtensionCheck() {
+  // open() reads the flag after this call, so clear it first in case a
+  // CHECK_RESULT below ends the function early.
+  extensionsAvailable = false;
+  char extensions[1024];
+  cl_int result = _wrapper->clGetPlatformInfo(
+      platform_, CL_PLATFORM_EXTENSIONS, sizeof(extensions), extensions, NULL);
+  CHECK_RESULT(result != CL_SUCCESS, "Failed to list platform extensions.");
+
+  const bool hasD3D11Sharing =
+      strstr(extensions, "cl_khr_d3d11_sharing") != NULL;
+  if (!hasD3D11Sharing) {
+    printf("cl_khr_d3d11_sharing extension is required for this test!\n");
+  }
+
+  OSVERSIONINFOEX versionInfo = {0};
+  versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+  versionInfo.dwMajorVersion = 6;
+  DWORDLONG conditionMask = 0;
+  VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
+  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION, conditionMask)) {
+    CHECK_RESULT(!hasD3D11Sharing,
+                 "Extension should be exported on Windows >= 6");
+  } else {
+    CHECK_RESULT(hasD3D11Sharing,
+                 "Extension should not be exported on Windows < 6");
+  }
+
+  result = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     sizeof(extensions), extensions, NULL);
+  CHECK_RESULT(result != CL_SUCCESS, "Failed to list device extensions.");
+  extensionsAvailable = strstr(extensions, "cl_amd_planar_yuv") ? true : false;
+  if (!extensionsAvailable) {
+    printf("cl_amd_planar_yuv extension is required for this test!\n");
+  }
+}
+
+// Prepares the test for `deviceId`: enumerates the platform's devices, checks
+// the required extensions, resolves the D3D11 interop entry points, creates a
+// D3D11 device and, when the CL device can share with it, a CL context and
+// command queue. `test`, `units` and `conversion` are not used here.
+void OCLDX11Common::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  // OpenCL Initialization
+  // OCLTestImp::open(test, units, conversion, deviceId);
+  BaseTestImp::open();
+  devices_ = 0;
+  deviceCount_ = 0;
+  context_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  _queue = 0;
+  _deviceId = deviceId;
+
+  dxD3D11Context = NULL;
+  dxD3D11Device = NULL;
+
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_);
+
+  cl_uint numPlatforms = 0;
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
+  CHECK_RESULT((numPlatforms == 0), "No platform found");
+  CHECK_RESULT((_platformIndex >= numPlatforms), "Invalid platform index");
+
+  // Held in a vector so that no exit path has to remember a delete[].
+  std::vector<cl_platform_id> platforms(numPlatforms);
+  error_ = _wrapper->clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+  platform_ = platforms[_platformIndex];
+  CHECK_RESULT((platform_ == 0), "AMD Platform not found");
+
+  error_ = _wrapper->clGetDeviceIDs(platform_, type_, 0, NULL, &deviceCount_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  devices_ = new cl_device_id[deviceCount_];
+  error_ =
+      _wrapper->clGetDeviceIDs(platform_, type_, deviceCount_, devices_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
+
+  // Reject a bad index before ExtensionCheck() reads devices_[_deviceId].
+  if (deviceId >= deviceCount_) {
+    _errorFlag = true;
+    return;
+  }
+
+  ExtensionCheck();
+  if (!extensionsAvailable) {
+    return;
+  }
+
+  // extract function pointers for exported functions
+  INITPFN(clGetDeviceIDsFromD3D11KHR);
+  INITPFN(clCreateFromD3D11BufferKHR);
+  INITPFN(clCreateFromD3D11Texture2DKHR);
+  INITPFN(clCreateFromD3D11Texture3DKHR);
+  INITPFN(clEnqueueAcquireD3D11ObjectsKHR);
+  INITPFN(clEnqueueReleaseD3D11ObjectsKHR);
+  INITPFN(clGetPlaneFromImageAMD);
+
+  D3D_FEATURE_LEVEL featureLevels[] = {
+      (D3D_FEATURE_LEVEL)D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0};
+  const D3D_DRIVER_TYPE driverTypes[] = {D3D_DRIVER_TYPE_HARDWARE,
+                                         D3D_DRIVER_TYPE_SOFTWARE};
+  D3D_FEATURE_LEVEL featureLevel;
+  HRESULT hr = E_FAIL;
+
+  // Create only the device, not the swapchain. We can't create the swapchain
+  // anyways without a handle to a window we explicitly own.
+  // Attempt order (first success wins): hardware with 11.1 + 11.0, hardware
+  // with 11.0 only, then the same two feature-level lists for the software
+  // driver type.
+  // NOTE(review): D3D11CreateDevice documents that D3D_DRIVER_TYPE_SOFTWARE
+  // needs a non-NULL Software module, so those attempts may always fail.
+  for (unsigned int t = 0; t < _countof(driverTypes) && FAILED(hr); t++) {
+    for (unsigned int skip = 0; skip < 2 && FAILED(hr); skip++) {
+      hr = D3D11CreateDevice(NULL, driverTypes[t], NULL, 0,
+                             featureLevels + skip,
+                             (UINT)(_countof(featureLevels) - skip),
+                             D3D11_SDK_VERSION, &dxD3D11Device, &featureLevel,
+                             &dxD3D11Context);
+    }
+  }
+
+  // Without a device there is nothing to share with OpenCL; stop here instead
+  // of handing a NULL device to clGetDeviceIDsFromD3D11KHR.
+  CHECK_RESULT(FAILED(hr), "D3D11CreateDevice failed (0x%08x)",
+               (unsigned int)hr);
+
+  // Context properties tying the CL context to the D3D11 device.
+  cl_int status = 0;
+  cl_context_properties cps[7] = {
+      CL_CONTEXT_D3D11_DEVICE_KHR,
+      (cl_context_properties)(ID3D11Device*)dxD3D11Device,
+      CL_CONTEXT_INTEROP_USER_SYNC,
+      CL_FALSE,
+      CL_CONTEXT_PLATFORM,
+      (cl_context_properties)platform_,
+      0};
+  cl_context_properties* cprops = (NULL == platform_) ? NULL : cps;
+
+  cl_uint deviceListSize = 0;
+  clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device,
+                             CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 0, NULL,
+                             &deviceListSize);
+
+  std::vector<cl_device_id> devices(deviceListSize);
+  // Indexing an empty vector is undefined, so only fetch a non-empty list.
+  if (deviceListSize != 0) {
+    clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device,
+                               CL_PREFERRED_DEVICES_FOR_D3D11_KHR,
+                               deviceListSize, &devices[0], NULL);
+  }
+
+  bool ret = false;
+  // Check that current device can be associated with the D3D11 device
+  for (unsigned int i = 0; i < deviceListSize; i++) {
+    if (devices[i] == devices_[_deviceId]) {
+      ret = true;
+      break;
+    }
+  }
+
+  if (ret) {
+    // Report creation failures here rather than in the first API call that
+    // later uses the context or queue.
+    context_ =
+        clCreateContext(cprops, 1, &devices_[_deviceId], NULL, NULL, &status);
+    CHECK_RESULT((status != CL_SUCCESS), "clCreateContext failed (%d)", status);
+    _queue = clCreateCommandQueue(context_, devices_[_deviceId], 0, &status);
+    CHECK_RESULT((status != CL_SUCCESS), "clCreateCommandQueue failed (%d)",
+                 status);
+  }
+  CHECK_RESULT((ret != true), "Can't find D3D device!");
+}
+
+unsigned int OCLDX11Common::close(void) {
+  if (_queue) clReleaseCommandQueue(_queue);  // 0 when open() stopped early
+  _queue = 0;  // a repeated close() must not release the same queue again
+  unsigned int retVal = OCLTestImp::close();
+  if (dxD3D11Context) dxD3D11Context->Release();
+  if (dxD3D11Device) dxD3D11Device->Release();
+  return retVal;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h
new file mode 100644
index 0000000000..0897cd6ad4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DX11_COMMON_H_
+#define _OCL_DX11_COMMON_H_
+
+#include <CL/cl.h>
+#include <CL/cl_d3d11.h>
+
+#include "OCLTestImp.h"
+#include "d3d11.h"
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL* clGetPlaneFromImageAMD_fn)(
+    cl_context /* context */, cl_mem /* mem */, cl_uint /* plane */,
+    cl_int* /* errcode_ret */);  // pointer type resolved through INITPFN
+
+class OCLDX11Common : public OCLTestImp {
+ public:
+  /////////////////////////////////////////
+  // construction and destruction        //
+  /////////////////////////////////////////
+  OCLDX11Common();
+  virtual ~OCLDX11Common();
+  ///////////////////////
+  // virtual interface //
+  ///////////////////////
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual unsigned int close(void);
+
+ protected:
+  bool extensionsAvailable;  // written by ExtensionCheck(), read by open()
+
+  ID3D11Device* dxD3D11Device;          // set in open(), released in close()
+  ID3D11DeviceContext* dxD3D11Context;  // set in open(), released in close()
+  ID3D11Texture2D* dxDX11Texture;  // not touched by OCLDX11Common.cpp
+  cl_command_queue _queue;         // set in open(), released in close()
+  // Extension entry points; open() resolves each one through INITPFN.
+  clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR;
+  clCreateFromD3D11BufferKHR_fn clCreateFromD3D11BufferKHR;
+  clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR;
+  clCreateFromD3D11Texture3DKHR_fn clCreateFromD3D11Texture3DKHR;
+  clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR;
+  clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR;
+  clGetPlaneFromImageAMD_fn clGetPlaneFromImageAMD;
+
+ private:
+  void ExtensionCheck();  // platform/device extension probe used by open()
+};
+
+#endif  // _OCL_DX11_COMMON_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp
new file mode 100644
index 0000000000..b9c156948c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp
@@ -0,0 +1,478 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDX11YUY2.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define DXGI_FORMAT_NV12 103  // numeric DXGI_FORMAT value of NV12
+#define DXGI_FORMAT_P010 104  // numeric DXGI_FORMAT value of P010
+#define GROUP_SIZE 256        // initial blockSizeX (see the constructor)
+
+const static char strKernel[] =  // image2imageCopy: per-texel uint4 copy
+    "__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | "
+    "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n"
+    "__kernel void image2imageCopy(                                            "
+    "                             \n"
+    "   __read_only image2d_t input,                                           "
+    "                             \n"
+    "   __write_only image2d_t output)                                         "
+    "                             \n"
+    "{                                                                         "
+    "                             \n"
+    "   int2 coord = (int2)(get_global_id(0), get_global_id(1));               "
+    "                             \n"
+    "   uint4 temp = read_imageui(input, imageSampler, coord);                 "
+    "                             \n"
+    "   write_imageui(output, coord, temp);                                    "
+    "                             \n"
+    "}                                                                         "
+    "                             \n";
+
+OCLDX11YUY2::OCLDX11YUY2()
+    : OCLDX11Common(), blockSizeX(GROUP_SIZE), blockSizeY(1),
+      clImage2DOut(0), dxFormat(DXGI_FORMAT_UNKNOWN) {
+  _numSubTests = 4;  // open() maps sub-tests 0-1 to NV12 and 2-3 to P010
+}
+
+OCLDX11YUY2::~OCLDX11YUY2() {}  // resources are released in close()
+
+void OCLDX11YUY2::open(unsigned int test, char *units, double &conversion,
+                       unsigned int deviceId) {
+  // Prepares one sub-test: resets the per-test handles, runs the base-class
+  // open(), selects the surface format for this sub-test, then builds the
+  // copy kernel and allocates the CL output image.
+  dxDX11Texture = 0;
+  clImage2DOut = 0;
+  _openTest = test;
+  srand((unsigned int)time(NULL));  // seed the C random number generator
+
+  OCLDX11Common::open(test, units, conversion, deviceId);
+  if (_errorFlag || !extensionsAvailable) {
+    return;
+  }
+
+  // Sub-tests 0 and 1 exercise NV12, the remaining ones P010.
+  const bool wantNV12 = (_openTest < 2);
+  if (wantNV12) {
+    dxFormat = (DXGI_FORMAT)DXGI_FORMAT_NV12;
+  } else {
+    dxFormat = (DXGI_FORMAT)DXGI_FORMAT_P010;
+  }
+
+  extensionsAvailable = formatSupported();
+  if (!extensionsAvailable) {
+    printf("%s", wantNV12 ? "DXGI_FORMAT_NV12 is required for this test!\n"
+                          : "DXGI_FORMAT_P010 is required for this test!\n");
+    return;
+  }
+
+  CompileKernel();
+  AllocateOpenCLImage();
+}
+
+void OCLDX11YUY2::run(void) {
+  // Writes a constant test pattern into a CPU-writable staging texture, copies
+  // it into the GPU texture dxDX11Texture and hands that to testInterop().
+  if (_errorFlag) return;
+  if (!extensionsAvailable) return;
+
+  D3D11_TEXTURE2D_DESC Desc = {0};
+  Desc.ArraySize = 1;
+  Desc.Format = dxFormat;
+  Desc.Width = OCLDX11YUY2::WIDTH;
+  Desc.Height = OCLDX11YUY2::HEIGHT;
+  Desc.MipLevels = 1;
+  Desc.SampleDesc.Count = 1;
+  // CPU access flags are incompatible with D3D11_RESOURCE_MISC_SHARED, so the
+  // pattern is written to a temporary staging texture and copied over to the
+  // real (possibly shared) texture afterwards.
+  Desc.Usage = D3D11_USAGE_STAGING;
+  Desc.BindFlags = 0;
+  Desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE | D3D11_CPU_ACCESS_READ;
+
+  // Initialised so the NULL test further down is valid if creation fails.
+  ID3D11Texture2D *pTextureTmp = NULL;
+  HRESULT hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &pTextureTmp);
+
+  D3D11_MAPPED_SUBRESOURCE LockedRectD11;
+  if (SUCCEEDED(hr)) {
+    hr =
+        dxD3D11Context->Map(pTextureTmp, 0, D3D11_MAP_WRITE, 0, &LockedRectD11);
+  }
+  if (SUCCEEDED(hr)) {
+    BYTE *pBase = (BYTE *)LockedRectD11.pData;
+    // First plane: HEIGHT rows, the first WIDTH bytes of each set to 0x7F.
+    for (unsigned int y = 0; y < HEIGHT; y++) {
+      memset(pBase + y * LockedRectD11.RowPitch, 0x7F, WIDTH);
+    }
+    // Second plane starts HEIGHT rows in: HEIGHT / 2 rows, each beginning with
+    // WIDTH / 2 interleaved (0x1F, 0x2F) byte pairs. Rows are addressed only
+    // inside this plane, so no pointer past the mapping is ever formed.
+    for (unsigned int y = 0; y < HEIGHT / 2; y++) {
+      BYTE *pLineUV = pBase + (HEIGHT + y) * LockedRectD11.RowPitch;
+      for (unsigned int x = 0; x < WIDTH / 2; x++) {
+        *pLineUV++ = 0x1F;  // U
+        *pLineUV++ = 0x2F;  // V
+      }
+    }
+    dxD3D11Context->Unmap(pTextureTmp, 0);
+  }
+
+  Desc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE;
+  Desc.Usage = D3D11_USAGE_DEFAULT;
+  Desc.CPUAccessFlags = 0;
+  // Sub-test 0 uses a non-shared texture, every other sub-test a shared one.
+  Desc.MiscFlags = (_openTest == 0) ? 0 : D3D11_RESOURCE_MISC_SHARED;
+
+  hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &dxDX11Texture);
+  const bool haveTexture = SUCCEEDED(hr) && (dxDX11Texture != NULL);
+  if (pTextureTmp != NULL) {
+    if (haveTexture) {
+      dxD3D11Context->CopySubresourceRegion(dxDX11Texture, 0, 0, 0, 0,
+                                            pTextureTmp, 0, NULL);
+    }
+    pTextureTmp->Release();
+  }
+  CHECK_RESULT_NO_RETURN(!haveTexture, "CreateTexture2D() failed");
+  if (haveTexture) testInterop();
+}
+
+void OCLDX11YUY2::AllocateOpenCLImage() {
+  // Creates clImage2DOut: a write-only, single-channel 2D image of WIDTH x
+  // (HEIGHT + HEIGHT / 2) texels, 8-bit for NV12 and 16-bit otherwise.
+  const bool isNV12 = (dxFormat == DXGI_FORMAT_NV12);
+  cl_image_format fmt{};
+  fmt.image_channel_order = CL_R;
+  fmt.image_channel_data_type = isNV12 ? CL_UNSIGNED_INT8 : CL_UNSIGNED_INT16;
+  cl_image_desc desc{};
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = WIDTH;
+  desc.image_height = HEIGHT + HEIGHT / 2;
+  cl_int err = 0;
+  clImage2DOut =
+      clCreateImage(context_, CL_MEM_WRITE_ONLY, &fmt, &desc, NULL, &err);
+  CHECK_RESULT((err != CL_SUCCESS), "AllocateOpenCLImage() failed");
+}
+
+void OCLDX11YUY2::testInterop() {
+  // Wraps dxDX11Texture as a CL image and validates it by read-back, by a
+  // kernel copy into clImage2DOut, and through its two plane views.
+  cl_int clStatus = 0;
+  cl_mem clImage2D =
+      clCreateFromD3D11Texture2DKHR(context_, 0, dxDX11Texture, 0, &clStatus);
+  CHECK_RESULT((clStatus != CL_SUCCESS),
+               "clCreateFromD3D11Texture2DKHR() failed");
+  // Acquire the image for CL use; nothing below is meaningful without it.
+  cl_event clEvent = NULL;
+  clStatus =
+      clEnqueueAcquireD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent);
+  if (clStatus != CL_SUCCESS) {
+    clReleaseMemObject(clImage2D);
+    CHECK_RESULT_NO_RETURN(true, "clEnqueueAcquireD3D11ObjectsKHR() failed");
+    return;
+  }
+  clStatus = clWaitForEvents(1, &clEvent);
+  clReleaseEvent(clEvent);
+
+  CopyOpenCLImage(clImage2D);
+  bool ImageReadWorks = CheckCLImage(clImage2D);
+  bool bKernelWorks = CheckCLImage(clImage2DOut);
+  CHECK_RESULT_NO_RETURN((ImageReadWorks != true),
+                         "CheckCLImage(clImage2D) failed");
+  CHECK_RESULT_NO_RETURN((bKernelWorks != true),
+                         "CheckCLImage(clImage2DOut) failed");
+  // The plane views are only inspected when both could be created.
+  cl_int statusY = CL_SUCCESS, statusUV = CL_SUCCESS;
+  cl_mem planeY = clGetPlaneFromImageAMD(context_, clImage2D, 0, &statusY);
+  cl_mem planeUV = clGetPlaneFromImageAMD(context_, clImage2D, 1, &statusUV);
+  const bool planesOk = (statusY == CL_SUCCESS) && (statusUV == CL_SUCCESS);
+  bool ImageWorksY = planesOk && CheckCLImageY(planeY);
+  bool ImageWorksUV = planesOk && CheckCLImageUV(planeUV);
+  if (statusY == CL_SUCCESS) clReleaseMemObject(planeY);
+  if (statusUV == CL_SUCCESS) clReleaseMemObject(planeUV);
+  // Hand the image back to D3D11 and drop it before the remaining checks.
+  clEvent = NULL;
+  clStatus =
+      clEnqueueReleaseD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent);
+  clStatus = clWaitForEvents(1, &clEvent);
+  clReleaseEvent(clEvent);
+  clReleaseMemObject(clImage2D);
+  CHECK_RESULT((statusY != CL_SUCCESS),
+               "clGetPlaneFromImageAMD(context_,clImage2D,0,&clStatus) failed");
+  CHECK_RESULT((statusUV != CL_SUCCESS),
+               "clGetPlaneFromImageAMD(context_,clImage2D,1,&clStatus) failed");
+  CHECK_RESULT_NO_RETURN((ImageWorksY != true), "CheckCLImageY() failed");
+  CHECK_RESULT_NO_RETURN((ImageWorksUV != true), "CheckCLImageUV() failed");
+}
+
+unsigned int OCLDX11YUY2::close(void) {  // release and clear, then base close
+  if (clImage2DOut) { clReleaseMemObject(clImage2DOut); clImage2DOut = 0; }
+  if (dxDX11Texture) { dxDX11Texture->Release(); dxDX11Texture = 0; }
+  return OCLDX11Common::close();
+}
+
+bool OCLDX11YUY2::CheckCLImage(cl_mem clImage) {
+  // Reads back a two-plane image (HEIGHT rows followed by HEIGHT / 2 rows)
+  // and compares it with the byte pattern run() wrote into the D3D11 texture.
+  // Returns true when all inspected bytes match; false on any mismatch or
+  // when the image cannot be queried or read.
+  cl_int clStatus = 0;
+
+  size_t pitch = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
+  if (clStatus != CL_SUCCESS || pitch == 0) return false;
+  pitch *= 2;  // host buffer uses twice the reported row pitch (as before)
+
+  size_t height = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  if (clStatus != CL_SUCCESS) return false;
+
+  CHECK_RESULT_NO_RETURN(height != (HEIGHT + HEIGHT / 2),
+                         "CheckCLImage: height!=(HEIGHT+HEIGHT/2)");
+
+  const size_t rows = HEIGHT + HEIGHT / 2;
+  char *pTempBuffer = new char[rows * pitch];
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {WIDTH, rows, 1};
+  clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0,
+                                pTempBuffer, 0, 0, 0);
+  ::clFinish(_queue);
+  if (clStatus != CL_SUCCESS) {
+    // The buffer was never filled; comparing it would read garbage.
+    delete[] pTempBuffer;
+    return false;
+  }
+
+  bool bMatch = true;
+
+  // First plane: the first WIDTH bytes of every row must be 0x7F.
+  for (unsigned int y = 0; y < HEIGHT && bMatch; y++) {
+    const char *pLine = pTempBuffer + y * pitch;
+    for (unsigned int x = 0; x < WIDTH; x++) {
+      if (pLine[x] != 0x7F) {
+        bMatch = false;
+        break;
+      }
+    }
+  }
+
+  // Second plane, HEIGHT rows in: WIDTH / 2 interleaved (0x1F, 0x2F) pairs.
+  for (unsigned int y = 0; y < HEIGHT / 2 && bMatch; y++) {
+    const char *pLineUV = pTempBuffer + (HEIGHT + y) * pitch;
+    for (unsigned int x = 0; x < WIDTH / 2; x++) {
+      if (pLineUV[2 * x] != 0x1F || pLineUV[2 * x + 1] != 0x2F) {
+        bMatch = false;
+        break;
+      }
+    }
+  }
+  delete[] pTempBuffer;
+
+  return bMatch;
+}
+
+bool OCLDX11YUY2::CheckCLImageY(cl_mem clImage) {
+  // Reads back the plane-0 view (HEIGHT rows) and checks that each row starts
+  // with WIDTH bytes of 0x7F. Returns false on any mismatch or when the image
+  // cannot be queried or read.
+  cl_int clStatus = 0;
+
+  size_t pitch = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
+  if (clStatus != CL_SUCCESS || pitch == 0) return false;
+  pitch *= 2;  // host buffer uses twice the reported row pitch (as before)
+
+  size_t height = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  if (clStatus != CL_SUCCESS) return false;
+
+  CHECK_RESULT_NO_RETURN(height != HEIGHT, "CheckCLImageY: height!=HEIGHT");
+
+  char *pTempBuffer = new char[HEIGHT * pitch];
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {WIDTH, HEIGHT, 1};
+  clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0,
+                                pTempBuffer, 0, 0, 0);
+  ::clFinish(_queue);
+  if (clStatus != CL_SUCCESS) {
+    // The buffer was never filled; comparing it would read garbage.
+    delete[] pTempBuffer;
+    return false;
+  }
+
+  bool bMatch = true;
+  for (unsigned int y = 0; y < HEIGHT && bMatch; y++) {
+    const char *pLine = pTempBuffer + y * pitch;
+    for (unsigned int x = 0; x < WIDTH; x++) {
+      if (pLine[x] != 0x7F) {
+        bMatch = false;
+        break;
+      }
+    }
+  }
+  delete[] pTempBuffer;
+
+  return bMatch;
+}
+
+bool OCLDX11YUY2::CheckCLImageUV(cl_mem clImage) {
+  // Reads back the plane-1 view (HEIGHT / 2 rows) and checks that each row
+  // starts with WIDTH / 2 interleaved (0x1F, 0x2F) byte pairs. Returns false
+  // on any mismatch or when the image cannot be queried or read.
+  cl_int clStatus = 0;
+
+  size_t pitch = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
+  if (clStatus != CL_SUCCESS || pitch == 0) return false;
+  pitch *= 2;  // host buffer uses twice the reported row pitch (as before)
+
+  size_t height = 0;
+  clStatus =
+      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  if (clStatus != CL_SUCCESS) return false;
+
+  CHECK_RESULT_NO_RETURN(height != HEIGHT / 2,
+                         "CheckCLImageUV: height!=HEIGHT/2");
+
+  char *pTempBuffer = new char[(HEIGHT / 2) * pitch];
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {WIDTH / 2, HEIGHT / 2, 1};
+  clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0,
+                                pTempBuffer, 0, 0, 0);
+
+  ::clFinish(_queue);
+
+  if (clStatus != CL_SUCCESS) {
+    // The buffer was never filled; comparing it would read garbage.
+    delete[] pTempBuffer;
+    return false;
+  }
+
+  bool bMatch = true;
+  // Walk each row as (U, V) byte pairs.
+  for (unsigned int y = 0; y < HEIGHT / 2 && bMatch; y++) {
+    const char *pLineUV = pTempBuffer + y * pitch;
+    for (unsigned int x = 0; x < WIDTH / 2; x++) {
+      const char u = pLineUV[2 * x];
+      const char v = pLineUV[2 * x + 1];
+      if (u != 0x1F || v != 0x2F) {
+        bMatch = false;
+        break;
+      }
+    }
+  }
+
+  delete[] pTempBuffer;
+
+  return bMatch;
+}
+
+void OCLDX11YUY2::CopyOpenCLImage(cl_mem clImageSrc) {
+  // Runs the copy kernel with clImageSrc as input and clImage2DOut as output
+  // over a WIDTH x (HEIGHT + HEIGHT / 2) range and waits for it to finish.
+  // Each failing CL call is reported through CHECK_RESULT.
+  cl_int status = 0;
+
+  // Bind the kernel arguments: 0 is the source image, 1 is the destination
+  // image (clImage2DOut).
+  status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImageSrc);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at "
+               "clSetKernelArg(kernel_,0,sizeof(cl_mem),&clImageSrc)");
+  status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clImage2DOut);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at "
+               "clSetKernelArg(kernel_,1,sizeof(cl_mem),&clImage2DOut)");
+
+  // One work-item per texel. The local size is passed as NULL, so the
+  // OpenCL runtime chooses the work-group shape; blockSizeX/blockSizeY are
+  // not applied to this launch.
+  size_t globalThreads[] = {WIDTH, HEIGHT + HEIGHT / 2};
+  status = clEnqueueNDRangeKernel(_queue, kernel_, 2, NULL, globalThreads, NULL,
+                                  0, NULL, NULL);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at clEnqueueNDRangeKernel");
+
+  // Block until the copy has completed.
+  status = clFinish(_queue);
+  CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLImage() failed at clFinish");
+}
+
+void OCLDX11YUY2::CompileKernel() {
+  // Builds strKernel for the selected device, creates the image2imageCopy
+  // kernel and clamps blockSizeX/blockSizeY to the kernel's work-group limit.
+  // Failures are recorded through the CHECK_RESULT macros.
+  cl_int status = 0;
+  // The source length must not count the literal's terminating NUL.
+  size_t kernelSize = sizeof(strKernel) - 1;
+  const char *strs = strKernel;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs,
+                                                 &kernelSize, &status);
+  CHECK_RESULT((status != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (status != CL_SUCCESS) {
+    // Print the compiler output, if any, before reporting the failure.
+    size_t buildLogSize = 0;
+    clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &buildLogSize);
+    if (status == CL_BUILD_PROGRAM_FAILURE && buildLogSize > 0) {
+      std::string buildLog(buildLogSize, '\0');
+      clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                            buildLogSize, &buildLog[0], NULL);
+      printf("%s", buildLog.c_str());
+    }
+    CHECK_RESULT_NO_RETURN(true, "clBuildProgram() failed");
+    return;
+  }
+
+  kernel_ = _wrapper->clCreateKernel(program_, "image2imageCopy", &status);
+  CHECK_RESULT((status != CL_SUCCESS), "clCreateKernel() failed");
+
+  size_t kernel2DWorkGroupSize = 0;
+  status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId],
+                                    CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
+                                    &kernel2DWorkGroupSize, 0);
+  // Shrink only after a successful query; a failed one leaves the limit at 0.
+  if (status == CL_SUCCESS && blockSizeX > kernel2DWorkGroupSize) {
+    blockSizeX = kernel2DWorkGroupSize;
+    blockSizeY = 1;
+  }
+}
+
+bool OCLDX11YUY2::formatSupported() {  // can dxFormat back a Texture2D?
+  UINT supported = 0u;  // stays 0 (unsupported) when the query itself fails
+  const HRESULT hr = dxD3D11Device->CheckFormatSupport(dxFormat, &supported);
+  return SUCCEEDED(hr) && (supported & D3D11_FORMAT_SUPPORT_TEXTURE2D) != 0;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h
new file mode 100644
index 0000000000..b8797fbeb5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DX11_YUY2_H_
+#define _OCL_DX11_YUY2_H_
+
+#include "OCLDX11Common.h"
+
+class OCLDX11YUY2 : public OCLDX11Common {  // NV12/P010 D3D11-CL interop test
+ public:
+  OCLDX11YUY2();
+  virtual ~OCLDX11YUY2();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ protected:
+  static const unsigned int WIDTH = 1280;  // width of the D3D11 test texture
+  static const unsigned int HEIGHT = 720;  // height of the D3D11 test texture
+
+  void testInterop();                       // acquires the texture, runs checks
+  void AllocateOpenCLImage();               // creates clImage2DOut
+  bool CheckCLImage(cl_mem clImage);        // verifies both planes of an image
+  bool CheckCLImageY(cl_mem clImage);       // verifies the plane-0 view
+  bool CheckCLImageUV(cl_mem clImage);      // verifies the plane-1 view
+  void CopyOpenCLImage(cl_mem clImageSrc);  // kernel copy into clImage2DOut
+  void CompileKernel();                     // builds image2imageCopy
+  bool formatSupported();                   // dxFormat usable as Texture2D?
+  void testFormat();  // NOTE(review): no definition in OCLDX11YUY2.cpp
+
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  cl_mem clImage2DOut;  /**< CL image the copy kernel writes to */
+  DXGI_FORMAT dxFormat; /**< Surface format selected in open() */
+};
+
+#endif  // _OCL_DX11_YUY2_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp
new file mode 100644
index 0000000000..534d3f541c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp
@@ -0,0 +1,52 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+//
+// Includes for tests
+//
+#ifdef ATI_OS_WIN
+#include "OCLDX11YUY2.h"
+#endif
+
+//
+//  Helpers for adding tests: a factory that heap-allocates a T, and TEST(),
+//  which expands to a {"name", factory-for-name} TestEntry initializer.
+template <typename T>
+static void* dictionary_CreateTestFunc(void) {
+  return new T();
+}
+
+#define TEST(name) \
+  { #name, &dictionary_CreateTestFunc < name> }
+
+#ifdef ATI_OS_WIN
+// OCLDX11YUY2.h is only included under ATI_OS_WIN, so it is only listed here.
+TestEntry TestList[] = {TEST(OCLDX11YUY2)};
+
+unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);
+#else
+TestEntry TestList[] = {{"void", 0}};  // placeholder so the array is non-empty
+unsigned int TestListCount = 0;        // the placeholder is not counted
+
+#endif
+unsigned int TestLibVersion = 0;
+const char* TestLibName = "ocldx";
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude b/projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude
new file mode 100644
index 0000000000..39345e8fd7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude
@@ -0,0 +1 @@
+# all clear
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp
new file mode 100644
index 0000000000..fe94e49fd0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLBuffer.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =  // clDest = source + 1, glDest = source + 2
+    "__kernel void glbuffer_test( __global uint4 *source, __global uint4 "
+    "*glDest, __global uint4 *clDest)   \n"
+    "{                                                                         "
+    "                             \n"
+    "    int  tid = get_global_id(0);                                          "
+    "                             \n"
+    "    clDest[ tid ] = source[ tid ] + (uint4)(1);                           "
+    "                             \n"
+    "    glDest[ tid ] = source[ tid ] + (uint4)(2);                           "
+    "                             \n"
+    "}                                                                         "
+    "                             \n";
+
+OCLGLBuffer::OCLGLBuffer() : inGLBuffer_(0), outGLBuffer_(0) {
+  _numSubTests = 1;  // only one sub-test is exposed
+}
+
+OCLGLBuffer::~OCLGLBuffer() {}  // GL and CL objects are released in close()
+
+void OCLGLBuffer::open(unsigned int test, char* units, double& conversion,
+                       unsigned int deviceId) {
+  // Seeds rand(), runs OCLGLCommon::open(), then builds strKernel and creates
+  // the "glbuffer_test" kernel. Failures are reported through CHECK_RESULT.
+  srand((unsigned int)time(NULL));
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled so the buffer is a valid (empty) string even when the log
+    // query fails, e.g. because the log does not fit into the buffer.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+  kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+void OCLGLBuffer::run(void) {
+  // Fills a GL buffer with random uint4 values, runs glbuffer_test on it
+  // through CL/GL sharing, and checks that the CL output buffer holds
+  // input + 1 and the GL output buffer holds input + 2. Failures are
+  // reported through CHECK_RESULT.
+  if (_errorFlag) return;
+
+  cl_mem buffer;
+  cl_uint4 inData[c_numOfElements] = {{{0}}};
+  cl_uint4 outDataCL[c_numOfElements] = {{{0}}};
+  cl_uint4 outDataGL[c_numOfElements] = {{{0}}};
+
+  // Initialize input data with random values
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      inData[i].s[j] = (unsigned int)rand();
+    }
+  }
+
+  // Generate and Bind in & out OpenGL buffers
+  glGenBuffers(1, &inGLBuffer_);
+  glGenBuffers(1, &outGLBuffer_);
+
+  glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer_);
+  glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inData,
+               GL_STATIC_DRAW);
+
+  glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_);
+  glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), outDataGL,
+               GL_STATIC_DRAW);
+
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+  glFinish();
+
+  // Create input buffer from GL input buffer
+  buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_READ_ONLY,
+                                          inGLBuffer_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create input GL buffer (%d)",
+               error_);
+  buffers_.push_back(buffer);
+
+  // Create output buffer from GL output buffer
+  buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_WRITE_ONLY,
+                                          outGLBuffer_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create output GL buffer (%d)",
+               error_);
+  buffers_.push_back(buffer);
+
+  // Create a CL output buffer
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                    c_numOfElements * sizeof(cl_uint4), NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)", error_);
+  buffers_.push_back(buffer);
+
+  // Assign args and execute
+  for (unsigned int i = 0; i < buffers_.size(); i++) {
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+  }
+
+  error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2,
+                                               &buffers()[0], 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+               error_);
+
+  size_t gws[1] = {c_numOfElements};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+               error_);
+
+  error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2,
+                                               &buffers()[0], 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReleaseGLObjects failed (%d)",
+               error_);
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_);
+
+  // Get the results from both CL and GL buffers
+  error_ = _wrapper->clEnqueueReadBuffer(
+      cmdQueues_[_deviceId], buffers()[2], CL_TRUE, 0,
+      c_numOfElements * sizeof(cl_uint4), outDataCL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! (%d)",
+               error_);
+
+  glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_);
+  void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+  // glMapBuffer returns NULL on failure; never copy from a NULL mapping.
+  if (glMem != NULL) {
+    memcpy(outDataGL, glMem, c_numOfElements * sizeof(cl_uint4));
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+  }
+  CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+
+  cl_uint4 expectedCL = {{0}};
+  cl_uint4 expectedGL = {{0}};
+
+  // Check output
+  for (unsigned int i = 0; i < c_numOfElements; ++i) {
+    // Expected values: input + 1 in the CL buffer, input + 2 in the GL buffer.
+    expectedCL = inData[i];
+    expectedGL = inData[i];
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      expectedCL.s[j] += 1;
+      expectedGL.s[j] += 2;
+    }
+
+    // Compare expected output with actual data received
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]),
+                   "Element %d in CL output buffer is incorrect!\n\t \
+                         expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
+                   i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2],
+                   expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1],
+                   outDataCL[i].s[2], outDataCL[i].s[3]);
+      CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]),
+                   "Element %d in GL output buffer is incorrect!\n\t \
+                         expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
+                   i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2],
+                   expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1],
+                   outDataGL[i].s[2], outDataGL[i].s[3]);
+    }
+  }
+}
+
+unsigned int OCLGLBuffer::close(void) {
+  // Drop the CL memory objects that run() pushed into buffers_.
+  for (unsigned int idx = 0; idx < buffers().size(); ++idx) {
+    clReleaseMemObject(buffers()[idx]);
+  }
+  buffers_.clear();
+
+  // Unbind, then delete both GL buffers in one call and forget their names.
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+  const GLuint glBuffers[2] = {inGLBuffer_, outGLBuffer_};
+  glDeleteBuffers(2, glBuffers);
+  inGLBuffer_ = 0;
+  outGLBuffer_ = 0;
+  return OCLGLCommon::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h
new file mode 100644
index 0000000000..937acb61b1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_BUFFER_H_
+#define _OCL_GL_BUFFER_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLBuffer : public OCLGLCommon {  // CL/GL buffer sharing test
+ public:
+  OCLGLBuffer();
+  virtual ~OCLGLBuffer();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);  // builds the glbuffer_test kernel
+  virtual void run(void);                    // runs the kernel and verifies
+  virtual unsigned int close(void);          // releases CL and GL buffers
+
+ private:
+  static const unsigned int c_numOfElements = 1024;  // uint4 elements per buffer
+  GLuint inGLBuffer_;   // GL buffer holding the kernel input
+  GLuint outGLBuffer_;  // GL buffer receiving the kernel's glDest output
+};
+
+#endif  // _OCL_GL_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp
new file mode 100644
index 0000000000..dfff6262a1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp
@@ -0,0 +1,303 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLBufferMultipleQueues.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =  // OpenCL C: glDest = source + 2, clDest = source + 1
+    "__kernel void glbuffer_test( __global uint4 *source, __global uint4 "
+    "*glDest, __global uint4 *clDest)   \n"
+    "{                                                                         "
+    "                             \n"
+    "    int  tid = get_global_id(0);                                          "
+    "                             \n"
+    "    glDest[ tid ] = source[ tid ] + (uint4)(2);                           "
+    "                             \n"
+    "    clDest[ tid ] = source[ tid ] + (uint4)(1);                           "
+    "                             \n"
+    "}                                                                         "
+    "                             \n";
+
+// NOTE(review): _numSubTests presumably tells the harness how many sub-tests to run.
+OCLGLBufferMultipleQueues::OCLGLBufferMultipleQueues() { _numSubTests = 1; }
+OCLGLBufferMultipleQueues::~OCLGLBufferMultipleQueues() {}  // cleanup happens in close()
+
+void OCLGLBufferMultipleQueues::open(unsigned int test, char* units,
+                                     double& conversion,
+                                     unsigned int deviceId) {
+  // Seeds rand() for run(), performs the common CL/GL setup, creates the
+  // additional command queues and builds the glbuffer_test kernel.
+  srand((unsigned int)time(NULL));
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  // Create multiple queues for the device (first add already created queue in
+  // OCLGLCommon::open, then add a second queue)
+  deviceCmdQueues_.resize(QUEUES_PER_DEVICE_COUNT);
+  deviceCmdQueues_[0] = cmdQueues_[deviceId];
+  for (int queueIndex = 1; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    deviceCmdQueues_[queueIndex] = cmdQueue;
+  }
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays a valid string if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+void OCLGLBufferMultipleQueues::run(void) {
+  // Runs the kernel once on every command queue, each with its own GL/CL
+  // buffers, then checks clDest == source + 1 and glDest == source + 2.
+  if (_errorFlag) return;
+
+  inputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
+  outputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
+  outputCLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
+
+  std::vector<std::vector<cl_uint4> > inData(
+      QUEUES_PER_DEVICE_COUNT);  // Input data per queue
+
+  inGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0);
+  outGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0);
+  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    // Initialize input data with random values
+    inData[queueIndex].resize(BUFFER_ELEMENTS_COUNT);
+    for (int i = 0; i < BUFFER_ELEMENTS_COUNT; i++) {
+      for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+        inData[queueIndex][i].s[j] = (unsigned int)rand();
+      }
+    }
+
+    // Generate and Bind in & out OpenGL buffers
+    glGenBuffers(1, &inGLBufferIDs_[queueIndex]);
+    glGenBuffers(1, &outGLBufferIDs_[queueIndex]);
+
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBufferIDs_[queueIndex]);
+    glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
+                 &inData[queueIndex][0], GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]);
+    glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
+                 NULL, GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glFinish();
+
+    // Create input buffer from GL input buffer
+    inputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer(
+        context_, CL_MEM_READ_ONLY, inGLBufferIDs_[queueIndex], &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create input GL buffer (%d)", error_);
+
+    // Create output buffer from GL output buffer
+    outputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer(
+        context_, CL_MEM_WRITE_ONLY, outGLBufferIDs_[queueIndex], &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create output GL buffer (%d)", error_);
+
+    // Create a CL output buffer
+    outputCLBufferPerQueue_[queueIndex] = _wrapper->clCreateBuffer(
+        context_, CL_MEM_WRITE_ONLY, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
+        NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)",
+                 error_);
+  }
+
+  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    // Assign arguments to kernel according to queue index
+    error_ = _wrapper->clSetKernelArg(
+        kernel_, 0, sizeof(cl_mem),
+        &inputGLBufferPerQueue_[queueIndex]);  // Input source
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+    error_ = _wrapper->clSetKernelArg(
+        kernel_, 1, sizeof(cl_mem),
+        &outputGLBufferPerQueue_[queueIndex]);  // Output glDest
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+    error_ = _wrapper->clSetKernelArg(
+        kernel_, 2, sizeof(cl_mem),
+        &outputCLBufferPerQueue_[queueIndex]);  // Output clDest
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    // Acquire input GL buffer
+    error_ = _wrapper->clEnqueueAcquireGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0,
+        NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    // Acquire output GL buffer
+    error_ = _wrapper->clEnqueueAcquireGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex],
+        0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    // Enqueue the kernel
+    size_t gws[1] = {BUFFER_ELEMENTS_COUNT};
+    error_ =
+        _wrapper->clEnqueueNDRangeKernel(deviceCmdQueues_[queueIndex], kernel_,
+                                         1, NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+                 error_);
+
+    // Release input GL buffer
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0,
+        NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    // Release output GL buffer
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex],
+        0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    // Flush commands in order to trigger the operations
+    error_ = _wrapper->clFlush(deviceCmdQueues_[queueIndex]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clFlush() failed (%d)", error_);
+  }
+
+  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
+    // Get the results from CL buffer (in a synchronous manner)
+    cl_uint4 outDataCL[BUFFER_ELEMENTS_COUNT];
+    error_ = _wrapper->clEnqueueReadBuffer(
+        deviceCmdQueues_[queueIndex], outputCLBufferPerQueue_[queueIndex],
+        CL_TRUE, 0, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), outDataCL, 0,
+        NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! (%d)",
+                 error_);
+
+    // Read the GL output back through a mapping. glMapBuffer returns NULL on
+    // failure, so check it before copying from the mapping.
+    cl_uint4 outDataGL[BUFFER_ELEMENTS_COUNT] = {{{0}}};
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]);
+    void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+    memcpy(outDataGL, glMem, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4));
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    const unsigned int componentCount = sizeof(cl_uint4) / sizeof(cl_uint);
+    cl_uint4 expectedCL = {{0}};
+    cl_uint4 expectedGL = {{0}};
+
+    // Check output
+    for (int i = 0; i < BUFFER_ELEMENTS_COUNT; ++i) {
+      // The kernel writes input + 1 to clDest and input + 2 to glDest
+      expectedCL = inData[queueIndex][i];
+      expectedGL = inData[queueIndex][i];
+      for (unsigned int j = 0; j < componentCount; j++) {
+        expectedCL.s[j] += 1;
+        expectedGL.s[j] += 2;
+      }
+
+      // Compare expected output with actual data received; the components
+      // are unsigned, hence %u in the messages
+      for (unsigned int j = 0; j < componentCount; j++) {
+        CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]),
+                     "Element %d in CL output buffer is incorrect!\n\t "
+                     "expected:{%u, %u, %u, %u} differs from actual:{%u, %u, %u, %u}",
+                     i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2],
+                     expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1],
+                     outDataCL[i].s[2], outDataCL[i].s[3]);
+        CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]),
+                     "Element %d in GL output buffer is incorrect!\n\t "
+                     "expected:{%u, %u, %u, %u} differs from actual:{%u, %u, %u, %u}",
+                     i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2],
+                     expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1],
+                     outDataGL[i].s[2], outDataGL[i].s[3]);
+      }
+    }
+  }
+}
+
+unsigned int OCLGLBufferMultipleQueues::close(void) {
+  // Tears down everything open() and run() created. Handles that are still
+  // NULL (e.g. run() stopped early) are skipped, and every container is
+  // emptied afterwards so no stale handle can be released a second time.
+  // Release cl buffers (must be done before releasing the associated GL
+  // buffers)
+  std::vector<cl_mem>* memLists[] = {&inputGLBufferPerQueue_,
+                                     &outputGLBufferPerQueue_,
+                                     &outputCLBufferPerQueue_};
+  const size_t listCount = sizeof(memLists) / sizeof(memLists[0]);
+  for (size_t listIndex = 0; listIndex < listCount; listIndex++) {
+    std::vector<cl_mem>& buffers = *memLists[listIndex];
+    for (size_t bufferIndex = 0; bufferIndex < buffers.size(); bufferIndex++) {
+      if (buffers[bufferIndex] == NULL) continue;
+      error_ = _wrapper->clReleaseMemObject(buffers[bufferIndex]);
+      CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                             "clReleaseMemObject() failed");
+    }
+    buffers.clear();
+  }
+
+  // Delete GL in & out buffers
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+  if (!inGLBufferIDs_.empty()) {
+    glDeleteBuffers((int)inGLBufferIDs_.size(), &inGLBufferIDs_[0]);
+    inGLBufferIDs_.clear();
+  }
+
+  if (!outGLBufferIDs_.empty()) {
+    glDeleteBuffers((int)outGLBufferIDs_.size(), &outGLBufferIDs_[0]);
+    outGLBufferIDs_.clear();
+  }
+
+  // Release queues created by open method, the first queue per device is
+  // released by base class
+  for (size_t queueIndex = 1; queueIndex < deviceCmdQueues_.size();
+       queueIndex++) {
+    if (deviceCmdQueues_[queueIndex] == NULL) continue;
+    error_ = _wrapper->clReleaseCommandQueue(deviceCmdQueues_[queueIndex]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+  deviceCmdQueues_.clear();
+
+  return OCLGLCommon::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h
new file mode 100644
index 0000000000..97a65e17ee
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
+#define _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLBufferMultipleQueues : public OCLGLCommon {
+ public:
+  OCLGLBufferMultipleQueues();
+  virtual ~OCLGLBufferMultipleQueues();
+  // open() builds the kernel and the extra queues, run() executes and
+  // verifies, close() releases what the other two created.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+ private:
+  static const int BUFFER_ELEMENTS_COUNT = 1024;  // cl_uint4 elements per buffer
+  static const int QUEUES_PER_DEVICE_COUNT = 2;   // queues used on the device
+  std::vector<cl_command_queue>
+      deviceCmdQueues_;  // Multiple queues per device (single device)
+  std::vector<cl_mem> inputGLBufferPerQueue_;   // Input GL buffer per queue
+  std::vector<cl_mem> outputGLBufferPerQueue_;  // Output GL buffer per queue
+  std::vector<cl_mem> outputCLBufferPerQueue_;  // Output CL buffer per queue
+  std::vector<GLuint> inGLBufferIDs_;           // Input GL buffers IDs
+  std::vector<GLuint> outGLBufferIDs_;          // Output GL buffers IDs
+};
+
+#endif  // _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp
new file mode 100644
index 0000000000..14a441c80a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp
@@ -0,0 +1,270 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLDepthBuffer.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =  // OpenCL C: output[y * width + x] = sampled pixel's z
+    "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+    "__kernel void gldepths_test( __global float *output, read_only  image2d_t "
+    "source, sampler_t sampler){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n"
+    "    output[ tidY * get_image_width( source ) + tidX ] =  value.z;\n"
+    "}\n";
+
+OCLGLDepthBuffer::OCLGLDepthBuffer()
+    : glDepthBuffer_(0),  // GL object names start out as 0
+      frameBufferOBJ_(0),
+      colorBuffer_(0),
+      clOutputBuffer_(NULL),  // CL handles start out empty
+      clDepth_(NULL),
+      clSampler_(NULL),
+      pGLOutput_(NULL),  // host-side result buffers, allocated in testDepthRead()
+      pCLOutput_(NULL),
+      extensionSupported_(false) {
+  _currentTest = 0;
+  _numSubTests = 2;  // NOTE(review): run() also has cases for tests 2 and 3
+}
+
+OCLGLDepthBuffer::~OCLGLDepthBuffer() {}  // resources are released in close()
+
+void OCLGLDepthBuffer::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  // Performs the common CL/GL setup, then builds the gldepths_test kernel
+  // unless the device does not report cl_khr_gl_depth_images.
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  // Zero-filled stack buffer: strstr() below always sees a terminated string
+  // and there is nothing to free on the early exits.
+  char extensions[8192] = {0};
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                                     sizeof(extensions), extensions, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)", error_);
+  // Not an error: run() does nothing while extensionSupported_ is false
+  if (!strstr(extensions, "cl_khr_gl_depth_images")) {
+    printf("skipping test depth interop not supported\n");
+    return;
+  }
+  extensionSupported_ = true;
+  _currentTest = test;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays a valid string if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+void OCLGLDepthBuffer::run(void) {
+  // Runs the depth read-back check for the sub-test selected in open(), using
+  // the depth format and framebuffer attachment that belong to it.
+  if (_errorFlag || !extensionSupported_) return;
+  bool retVal = false;  // initialised so it is never read indeterminate
+  switch (_currentTest) {
+    case 0:
+      retVal = testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_ATTACHMENT);
+      break;
+    case 1:
+      retVal = testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_ATTACHMENT);
+      break;
+    case 2:
+      retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT);
+      break;
+    case 3:
+      retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT);
+      break;
+    default:
+      CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT((retVal != true), "cl-gl depth test failed ");
+}
+
+bool OCLGLDepthBuffer::testDepthRead(GLint internalFormat,
+                                     GLenum attachmentType) {
+  // Renders a quad into an FBO whose depth attachment is a renderbuffer of
+  // internalFormat, reads that renderbuffer through OpenCL and returns true
+  // when the result is byte-identical to what glReadPixels reports.
+  // Everything created here is stored in members and released by close().
+  cl_int error;
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+  unsigned int bufferSize = c_dimSize * c_dimSize * 4;  // one float per pixel
+
+  pGLOutput_ = (float*)malloc(bufferSize);
+  pCLOutput_ = (float*)malloc(bufferSize);
+  if (!pGLOutput_ || !pCLOutput_) return false;
+  // create Frame buffer object
+  glGenFramebuffers(1, &frameBufferOBJ_);
+  // create the color texture
+  glGenTextures(1, &colorBuffer_);
+  glEnable(GL_TEXTURE_2D);
+  glBindTexture(GL_TEXTURE_2D, colorBuffer_);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA,
+               GL_UNSIGNED_BYTE, 0);
+  glBindTexture(GL_TEXTURE_2D, 0);
+  // create a renderbuffer for the depth/stencil buffer
+  glGenRenderbuffers(1, &glDepthBuffer_);
+  glBindRenderbuffer(GL_RENDERBUFFER, glDepthBuffer_);
+  glRenderbufferStorage(GL_RENDERBUFFER, internalFormat, c_dimSize, c_dimSize);
+  // attach the color texture and the depth renderbuffer to the FBO
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0);
+  glFramebufferRenderbuffer(GL_FRAMEBUFFER, attachmentType, GL_RENDERBUFFER,
+                            glDepthBuffer_);
+  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  if (GL_FRAMEBUFFER_COMPLETE != status) return false;
+
+  // set up gl state machine
+  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
+  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
+  glLoadIdentity();                        // Reset The Projection Matrix
+  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
+  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
+  glLoadIdentity();
+  glEnable(GL_DEPTH_TEST);
+  glClear(GL_COLOR_BUFFER_BIT |
+          GL_DEPTH_BUFFER_BIT);     // Clear Screen And Depth Buffer
+  glBegin(GL_QUADS);                // Draw A Quad
+  glVertex3f(-1.0f, 1.0f, -6.0f);   // Top Left
+  glVertex3f(1.0f, 1.0f, -6.0f);    // Top Right
+  glVertex3f(1.0f, -1.0f, -3.0f);   // Bottom Right
+  glVertex3f(-1.0f, -1.0f, -3.0f);  // Bottom Left
+  glEnd();
+  glFinish();
+
+  clDepth_ = _wrapper->clCreateFromGLRenderbuffer(context_, CL_MEM_READ_WRITE,
+                                                  glDepthBuffer_, &error);
+  if (CL_SUCCESS != error) {
+    printf("clCreateFromGLRenderbuffer failed\n");
+    return false;
+  }
+  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                             bufferSize, NULL, &error);
+  if (CL_SUCCESS != error) return false;
+  clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE,
+                                         CL_FILTER_NEAREST, &error);
+  if (CL_SUCCESS != error) return false;
+
+  error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
+                                              &clDepth_, 0, NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+  // Run the kernel; each step is attempted only if the previous one succeeded
+  error =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
+  if (CL_SUCCESS == error) {
+    error = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_);
+  }
+  if (CL_SUCCESS == error) {
+    error =
+        _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_);
+  }
+  if (CL_SUCCESS == error) {
+    error = _wrapper->clEnqueueNDRangeKernel(
+        cmdQueues_[_deviceId], kernel_, 2, NULL, dimSizes, NULL, 0, NULL, NULL);
+  }
+  // Hand the renderbuffer back to GL even when the kernel could not be run
+  cl_int releaseError = _wrapper->clEnqueueReleaseGLObjects(
+      cmdQueues_[_deviceId], 1, &clDepth_, 0, NULL, NULL);
+  if (CL_SUCCESS != error || CL_SUCCESS != releaseError) return false;
+  error = _wrapper->clEnqueueReadBuffer(
+      cmdQueues_[_deviceId], clOutputBuffer_, CL_TRUE, 0, bufferSize,
+      pCLOutput_, 0, NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+  glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT,
+               pGLOutput_);
+
+  // test that both resources are identical.
+  if (0 != memcmp(pGLOutput_, pCLOutput_, bufferSize)) {
+    printf("expected results is different from actual results\n");
+    dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize);
+    dumpBuffer(pCLOutput_, "CLDepth.csv", c_dimSize);
+    return false;
+  }
+  return true;  // test successful
+}
+
+unsigned int OCLGLDepthBuffer::close(void) {
+  // Releases whatever testDepthRead() created and resets every member so a
+  // handle can never be released twice.
+  free(pGLOutput_);  // free(NULL) is a no-op
+  pGLOutput_ = NULL;
+  free(pCLOutput_);
+  pCLOutput_ = NULL;
+  // The CL handles are still 0 when the test was skipped or failed early
+  if (clDepth_) clReleaseMemObject(clDepth_);
+  if (clOutputBuffer_) clReleaseMemObject(clOutputBuffer_);
+  if (clSampler_) clReleaseSampler(clSampler_);
+  clDepth_ = 0;
+  clOutputBuffer_ = 0;
+  clSampler_ = 0;
+  // unbind the texture and frame buffer.
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  // clean gl resources
+  glDeleteFramebuffers(1, &frameBufferOBJ_);
+  frameBufferOBJ_ = 0;
+  glDeleteTextures(1, &colorBuffer_);
+  colorBuffer_ = 0;
+  // glDepthBuffer_ comes from glGenRenderbuffers, so it is not a texture
+  glDeleteRenderbuffers(1, &glDepthBuffer_);
+  glDepthBuffer_ = 0;
+  return OCLGLCommon::close();
+}
+
+// helper functions
+unsigned int OCLGLDepthBuffer::formatToSize(GLint internalFormat) {
+  // Returns the size in bytes of one element of the given GL depth or
+  // depth/stencil internal format, or 0 for a format that is not listed
+  // below.
+  switch (internalFormat) {
+    case GL_DEPTH_COMPONENT16:
+      return 2;
+    // these two formats share the same element size
+    case GL_DEPTH_COMPONENT32F:
+    case GL_DEPTH24_STENCIL8:
+      return 4;
+    case GL_DEPTH32F_STENCIL8:
+      return 8;
+    // anything else is reported as size 0
+    default:
+      return 0;
+  }
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h
new file mode 100644
index 0000000000..b8a3d46ad2
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_DEPTH_BUFFER_H_
+#define _OCL_GL_DEPTH_BUFFER_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLDepthBuffer : public OCLGLCommon {  // compares a GL depth buffer read via CL and GL
+ public:
+  OCLGLDepthBuffer();
+  virtual ~OCLGLDepthBuffer();
+  static const unsigned int c_dimSize = 128;  // width and height of the render target
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  ////////////////////
+  // test functions //
+  ////////////////////
+  bool testDepthRead(GLint internalFormat, GLenum attachmentType);  // true on match
+  unsigned int _currentTest;  // sub-test index stored by open(), read by run()
+  /////////////////////
+  // private members //
+  /////////////////////
+  // GL resource identifiers
+  GLuint glDepthBuffer_;   // renderbuffer used as depth attachment
+  GLuint frameBufferOBJ_;  // framebuffer object
+  GLuint colorBuffer_;     // texture used as color attachment
+
+  // CL identifiers
+  cl_mem clOutputBuffer_;  // buffer the kernel writes to
+  cl_mem clDepth_;         // CL view of glDepthBuffer_
+  cl_sampler clSampler_;   // sampler passed to the kernel
+
+  // pointers to buffers
+  float* pGLOutput_;         // depth values read with glReadPixels
+  float* pCLOutput_;         // depth values read back from clOutputBuffer_
+  bool extensionSupported_;  // set in open() when cl_khr_gl_depth_images is reported
+  //////////////////////////////
+  // private helper functions //
+  //////////////////////////////
+  // returns element size in bytes.
+  static unsigned int formatToSize(GLint internalFormat);
+};
+
+#endif  // _OCL_GL_DEPTH_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp
new file mode 100644
index 0000000000..38d8099708
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp
@@ -0,0 +1,278 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLDepthTex.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =  // output[y * width + x] = texel(x, y).z
+    "__kernel void gldepths_test( __global float *output, read_only image2d_t "
+    "source, sampler_t sampler){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n"
+    "    output[ tidY * get_image_width( source ) + tidX ] =  value.z;\n"
+    "}\n";
+
+OCLGLDepthTex::OCLGLDepthTex()
+    : glDepthBuffer_(0),
+      frameBufferOBJ_(0),
+      colorBuffer_(0),
+      clOutputBuffer_(NULL),
+      clDepth_(NULL),
+      clSampler_(NULL),
+      pGLOutput_(NULL),
+      pCLOutput_(NULL),
+      extensionSupported_(false) {
+  _currentTest = 0;
+  _numSubTests = 8;  // 4 depth formats, each with and without -cl-std=CL2.0
+}
+
+OCLGLDepthTex::~OCLGLDepthTex() {}  // resources are released in close()
+
+void OCLGLDepthTex::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  // Common GL/CL setup, then builds gldepths_test unless the device lacks
+  // cl_khr_gl_depth_images (the test is then skipped, not failed). Sub-tests
+  // 4-7 build with -cl-std=CL2.0; run() picks the depth format from test % 4.
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Zero-filled so strstr() always sees a terminated string, even when the
+  // query fails and leaves the buffer untouched.
+  const size_t extBufSize = 8192;
+  char* pExtensions = (char*)calloc(extBufSize, 1);
+  CHECK_RESULT((pExtensions == NULL), "failed to allocate extension buffer");
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                            extBufSize, pExtensions, NULL);
+  const bool found = strstr(pExtensions, "cl_khr_gl_depth_images") != NULL;
+  free(pExtensions);
+  if (!found) {
+    printf("skipping test depth interop not supported\n");
+    return;
+  }
+  extensionSupported_ = true;
+
+  const char* options = (test < 4) ? (const char*)NULL : "-cl-std=CL2.0";
+  _currentTest = test % 4;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays "" if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+void OCLGLDepthTex::run(void) {
+  if (_errorFlag || !extensionSupported_) {
+    return;
+  }
+  // One entry per sub-test: the GL internal format plus the matching pixel
+  // format/type that testDepthRead() hands to glTexImage2D().
+  struct DepthFormat {
+    GLint internalFormat;
+    GLenum format;
+    GLenum type;
+  };
+  static const DepthFormat kFormats[] = {
+      {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8},
+      {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_FLOAT},
+      {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT},
+      {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL,
+       GL_FLOAT_32_UNSIGNED_INT_24_8_REV},
+  };
+  const unsigned int numFormats = sizeof(kFormats) / sizeof(kFormats[0]);
+  // Reject an out-of-range index up front so no result is read unset.
+  CHECK_RESULT((_currentTest >= numFormats), "unsupported test number\n");
+
+  const DepthFormat& fmt = kFormats[_currentTest];
+  const bool retVal =
+      testDepthRead(fmt.internalFormat, fmt.format, fmt.type);
+  CHECK_RESULT((retVal != true), "cl-gl depth test failed ");
+}
+
+bool OCLGLDepthTex::testDepthRead(GLint internalFormat, GLenum format,
+                                  GLenum type) {
+  // Renders three quads into an FBO whose depth attachment is a texture of
+  // the requested format, copies that depth through the CL kernel and checks
+  // it against glReadPixels(). Returns true only if every pass matches.
+  const unsigned int bufferSize = c_dimSize * c_dimSize * 4;
+  pGLOutput_ = (float*)malloc(bufferSize);
+  pCLOutput_ = (float*)malloc(bufferSize);
+  if (!pGLOutput_ || !pCLOutput_) return false;  // close() frees what exists
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+
+  // create Frame buffer object
+  glGenFramebuffers(1, &frameBufferOBJ_);
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+
+  // create   textures
+  glGenTextures(1, &colorBuffer_);
+  glBindTexture(GL_TEXTURE_2D, colorBuffer_);
+
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA,
+               GL_UNSIGNED_BYTE, 0);
+
+  glGenTextures(1, &glDepthBuffer_);
+  glBindTexture(GL_TEXTURE_2D, glDepthBuffer_);
+  glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, c_dimSize, c_dimSize, 0,
+               format, type, 0);
+  glGetError();  // value unused; the call is kept because it clears the flag
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0);
+
+  if (GL_DEPTH_COMPONENT == format) {
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, glDepthBuffer_,
+                         0);
+  } else {
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
+                         glDepthBuffer_, 0);
+  }
+
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+
+  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  if (GL_FRAMEBUFFER_COMPLETE != status) {
+    printf("frame buffer incomplete!\n");
+    return false;
+  }
+  // set up gl state machine
+  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
+  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
+  glLoadIdentity();                        // Reset The Projection Matrix
+  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
+  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
+  glLoadIdentity();
+  glEnable(GL_DEPTH_TEST);
+  glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
+
+  cl_int error;
+
+  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                             bufferSize, NULL, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE,
+                                         CL_FILTER_NEAREST, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clDepth_ = _wrapper->clCreateFromGLTexture(
+      context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, glDepthBuffer_, &error);
+  if (CL_SUCCESS != error) return false;
+  bool retVal = true;  // cleared by the first pass that does not match
+  for (int i = 0; i < 3; ++i) {
+    glClear(GL_COLOR_BUFFER_BIT |
+            GL_DEPTH_BUFFER_BIT);  // Clear Screen And Depth Buffer
+
+    const float zValues[3][2] = {
+        {-6.f, -3.f},
+        {-5.f, -2.f},
+        {-4.f, -1.f},
+    };
+
+    glBegin(GL_QUADS);                        // Draw A Quad
+    glVertex3f(-1.0f, 1.0f, zValues[i][0]);   // Top Left
+    glVertex3f(1.0f, 1.0f, zValues[i][0]);    // Top Right
+    glVertex3f(1.0f, -1.0f, zValues[i][1]);   // Bottom Right
+    glVertex3f(-1.0f, -1.0f, zValues[i][1]);  // Bottom Left
+    glEnd();
+
+    glFinish();
+
+    error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
+                                                &clDepth_, 0, NULL, NULL);
+    if (CL_SUCCESS != error) return false;  // kernel must not run unacquired
+    _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
+
+    _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_);
+
+    _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_);
+
+    _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL,
+                                     dimSizes, NULL, 0, NULL, NULL);
+
+    _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clDepth_, 0,
+                                        NULL, NULL);
+
+    _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_,
+                                  CL_TRUE, 0, bufferSize, pCLOutput_, 0, NULL,
+                                  NULL);
+
+    // Reference copy of the same depth data, read back through GL.
+    glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT,
+                 pGLOutput_);
+
+    // test that both resources are identical.
+    if (0 != memcmp(pGLOutput_, pCLOutput_, bufferSize)) {
+      retVal = false;  // one mismatching pass fails the whole sub-test
+      printf("expected results is different from actual results\n");
+      dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize);
+      dumpBuffer(pCLOutput_, "clDepth_.csv", c_dimSize);
+    }
+  }
+
+  return retVal;
+}
+
+unsigned int OCLGLDepthTex::close(void) {
+  // Releases everything testDepthRead() created, then runs the common close.
+  free(pGLOutput_);  // free(NULL) is a no-op, so no guard is needed
+  pGLOutput_ = NULL;
+  free(pCLOutput_);
+  pCLOutput_ = NULL;
+
+  // The CL handles are still 0 when the sub-test was skipped or failed early,
+  // so release only what exists and clear the handles afterwards.
+  if (clDepth_) clReleaseMemObject(clDepth_);
+  if (clOutputBuffer_) clReleaseMemObject(clOutputBuffer_);
+  if (clSampler_) clReleaseSampler(clSampler_);
+  clDepth_ = clOutputBuffer_ = 0;
+  clSampler_ = 0;
+  // unbind the texture and frame buffer.
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  // clean gl resources
+  glDeleteFramebuffers(1, &frameBufferOBJ_);
+  frameBufferOBJ_ = 0;
+  glDeleteTextures(1, &colorBuffer_);
+  colorBuffer_ = 0;
+  glDeleteTextures(1, &glDepthBuffer_);
+  glDepthBuffer_ = 0;
+
+  return OCLGLCommon::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h
new file mode 100644
index 0000000000..07be55d678
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_DEPTH_TEX_H_
+#define _OCL_GL_DEPTH_TEX_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLDepthTex : public OCLGLCommon {
+ public:
+  OCLGLDepthTex();
+  virtual ~OCLGLDepthTex();
+  static const unsigned int c_dimSize = 128;  // texture width and height
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Creates the GL/CL objects, renders, and reports whether the depth read
+  // through the CL kernel matched glReadPixels(). format and type are passed
+  // to glTexImage2D() together with internalFormat.
+  bool testDepthRead(GLint internalFormat, GLenum format, GLenum type);
+  unsigned int _currentTest;  // depth format index used by run() (test % 4)
+
+  // The members below are zero-initialized by the constructor, filled in by
+  // testDepthRead() and released by close(); extensionSupported_ is the
+  // exception and is set by open().
+  // GL resource identifiers
+  GLuint glDepthBuffer_;
+  GLuint frameBufferOBJ_;
+  GLuint colorBuffer_;
+
+  // CL identifiers
+  cl_mem clOutputBuffer_;
+  cl_mem clDepth_;
+  cl_sampler clSampler_;
+
+  // pointers to buffers
+  float* pGLOutput_;
+  float* pCLOutput_;
+  bool extensionSupported_;
+};
+
+#endif  // _OCL_GL_DEPTH_TEX_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp
new file mode 100644
index 0000000000..9d16495e1b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp
@@ -0,0 +1,481 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLFenceSync.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "Timer.h"
+#ifndef WIN_OS
+#include <GL/glx.h>
+#endif
+
+const static char *strKernel =  // dest[tid] = source[tid] + (uint4)(1)
+    "__kernel void glmulticontext_test( __global uint4 *source, __global uint4 "
+    "*dest)   \n"
+    "{                                                                         "
+    "         \n"
+    "    int  tid = get_global_id(0);                                          "
+    "         \n"
+    "    dest[ tid ] = source [ tid ] + (uint4)(1);                            "
+    "         \n"
+    "}                                                                         "
+    "         \n";
+
+OCLGLFenceSync::OCLGLFenceSync() : failed_(false), extensionSupported_(false) {
+  memset(contextData_, 0, sizeof(contextData_));  // all handles start zeroed
+  _numSubTests = 2;  // 0: sync through a GL fence, 1: sync through glFinish
+}
+
+OCLGLFenceSync::~OCLGLFenceSync() {}  // teardown happens in close()
+
+#ifdef WIN_OS  // same pointer types in both branches; WIN_OS adds __stdcall
+typedef GLsync(__stdcall *glFenceSyncPtr)(GLenum condition, GLbitfield flags);
+typedef bool(__stdcall *glIsSyncPtr)(GLsync sync);
+typedef void(__stdcall *glDeleteSyncPtr)(GLsync sync);
+typedef GLenum(__stdcall *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                               GLuint64 timeout);
+typedef void(__stdcall *glWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                       GLuint64 timeout);
+typedef void(__stdcall *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
+typedef void(__stdcall *glGetSyncivPtr)(GLsync sync, GLenum pname,
+                                        GLsizei bufSize, GLsizei *length,
+                                        GLint *values);
+#else
+typedef GLsync (*glFenceSyncPtr)(GLenum condition, GLbitfield flags);
+typedef bool (*glIsSyncPtr)(GLsync sync);
+typedef void (*glDeleteSyncPtr)(GLsync sync);
+typedef GLenum (*glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                      GLuint64 timeout);
+typedef void (*glWaitSyncPtr)(GLsync sync, GLbitfield flags, GLuint64 timeout);
+typedef void (*glGetInteger64vPtr)(GLenum pname, GLint64 *params);
+typedef void (*glGetSyncivPtr)(GLsync sync, GLenum pname, GLsizei bufSize,
+                               GLsizei *length, GLint *values);
+#endif
+// NOTE(review): GLsync is used above, so this redeclaration looks redundant.
+typedef struct __GLsync *GLsync;
+// Entry points below are filled in at run time by InitSyncFns().
+glFenceSyncPtr glFenceSyncFunc;
+
+glIsSyncPtr glIsSyncFunc;
+
+glDeleteSyncPtr glDeleteSyncFunc;
+
+glClientWaitSyncPtr glClientWaitSyncFunc;
+
+glWaitSyncPtr glWaitSyncFunc;
+
+glGetInteger64vPtr glGetInteger64vFunc;
+
+glGetSyncivPtr glGetSyncivFunc;
+// Prints the GLU description of the error code returned by glGetError().
+#define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError()))
+// NOTE(review): presumably mirrors the CL header's extension macro - confirm.
+#define cl_khr_gl_event 1
+
+static void InitSyncFns() {
+  // Looks up each GL sync entry point by name through the platform loader
+  // (wglGetProcAddress on WIN_OS, glXGetProcAddress otherwise) and stores the
+  // result, unchecked, in the matching file-level function pointer.
+
+  // Local shorthand for the loader call; undefined again before returning.
+#ifdef WIN_OS
+#define OCL_SYNC_PROC(name) wglGetProcAddress(name)
+#else
+#define OCL_SYNC_PROC(name) glXGetProcAddress((GLubyte *)name)
+#endif
+
+  glFenceSyncFunc = (glFenceSyncPtr)OCL_SYNC_PROC("glFenceSync");
+  glIsSyncFunc = (glIsSyncPtr)OCL_SYNC_PROC("glIsSync");
+  glDeleteSyncFunc = (glDeleteSyncPtr)OCL_SYNC_PROC("glDeleteSync");
+  glClientWaitSyncFunc =
+      (glClientWaitSyncPtr)OCL_SYNC_PROC("glClientWaitSync");
+  glWaitSyncFunc = (glWaitSyncPtr)OCL_SYNC_PROC("glWaitSync");
+  glGetInteger64vFunc =
+      (glGetInteger64vPtr)OCL_SYNC_PROC("glGetInteger64v");
+  glGetSyncivFunc = (glGetSyncivPtr)OCL_SYNC_PROC("glGetSynciv");
+
+#undef OCL_SYNC_PROC
+}
+
+#define USING_ARB_sync 1  // NOTE(review): not referenced elsewhere in this file
+// Signature of the entry point that turns a GLsync into a cl_event.
+typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)(
+    cl_context context, GLsync sync, cl_int *errCode_ret);
+// Resolved in OCLGLFenceSync::run() via clGetExtensionFunctionAddress().
+clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr;
+
+/* Helper to determine if an extension is supported by a device.
+ * Returns 0 if extensionName is one of the space-separated names in the
+ * device's CL_DEVICE_EXTENSIONS string, -1 if it is not (or is empty), and
+ * -2, -3, -40 or -5 if the string could not be sized, allocated or read. */
+int is_extension_available(cl_device_id device, const char *extensionName) {
+  size_t size = 0;
+  int err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &size);
+  if (err) {
+    printf("Error: failed to determine size of device extensions string "
+           "(err = %d)\n", err);
+    return -2;
+  }
+  if (0 == size) return -3;
+  char *extString = (char *)malloc(size);
+  if (NULL == extString) {
+    printf("Error: unable to allocate %ld byte buffer for extension string "
+           "(err = %d)\n", (long)size, err);
+    return -40;
+  }
+  err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, extString, NULL);
+  if (err) {
+    printf("Error: failed to obtain device extensions string (err = %d)\n",
+           err);
+    free(extString);
+    return -5;
+  }
+
+  // Accept whole names only: a plain substring hit would also match a longer
+  // extension name that merely contains extensionName.
+  int result = -1;
+  const size_t nameLen = strlen(extensionName);
+  for (const char *p = extString; nameLen && (p = strstr(p, extensionName));
+       ++p) {
+    const bool atStart = (p == extString) || (p[-1] == ' ');
+    if (atStart && (p[nameLen] == ' ' || p[nameLen] == '\0')) result = 0;
+  }
+  free(extString);
+  return result;
+}
+
+void OCLGLFenceSync::open(unsigned int test, char *units, double &conversion,
+                          unsigned int deviceId) {
+  // Common setup, then per GL context: a CL context sharing with it, a command
+  // queue and the built glmulticontext_test kernel (needs cl_khr_gl_event).
+  _openTest = test;
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // The device does not change inside the loop, so query the extension once.
+  error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event");
+  if (error_ != CL_SUCCESS) {
+    printf("Silent failure: cl_khr_gl_event extension not available (%d)\n",
+           error_);
+    extensionSupported_ = false;
+    return;
+  }
+  extensionSupported_ = true;
+  cl_context_properties properties[7] = {0};
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    createGLContext(contextData_[i].glContext);
+    getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties);
+
+    // Create new CL context from GL context
+    contextData_[i].clContext = _wrapper->clCreateContext(
+        properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)",
+                 error_);
+
+    // Create command queue for new context
+    contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue(
+        contextData_[i].clContext, devices_[_deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+                 error_);
+
+    // Build the kernel
+    contextData_[i].clProgram = _wrapper->clCreateProgramWithSource(
+        contextData_[i].clContext, 1, &strKernel, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clCreateProgramWithSource()  failed (%d)", error_);
+
+    error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1,
+                                      &devices_[deviceId], NULL, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      char programLog[1024] = {0};  // stays "" if the log query fails
+      _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram,
+                                      devices_[deviceId], CL_PROGRAM_BUILD_LOG,
+                                      1024, programLog, 0);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
+                 error_);
+
+    contextData_[i].clKernel = _wrapper->clCreateKernel(
+        contextData_[i].clProgram, "glmulticontext_test", &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
+                 error_);
+  }
+}
+
+void OCLGLFenceSync::run() {
+  // For each GL context: uploads inOutData to a GL buffer, runs the kernel on it
+  // via CL/GL sharing (sub-test 0 orders with a GL fence, 1 with glFinish).
+  if (_errorFlag || !extensionSupported_) return;
+
+  CPerfCounter timer;
+  double sec;
+  float perf;
+  cl_uint4 inOutData[c_numOfElements] = {{{0}}};
+  cl_uint4 expectedData[c_numOfElements] = {{{0}}};
+  unsigned int m = sizeof(cl_uint4) / sizeof(cl_uint);
+  int count = 0;
+  // Element i starts as i; every context pass is expected to add 1 to it
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < m; j++) {
+      inOutData[i].s[j] = (unsigned int)i;
+      expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount;
+    }
+  }
+
+  cl_event fenceEvent0 = NULL, fenceEvent = NULL;
+  GLsync glFence0 = NULL, glFence = NULL;
+  InitSyncFns();
+
+  clCreateEventFromGLsyncKHR_ptr =
+      (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddress(
+          "clCreateEventFromGLsyncKHR");
+  if (clCreateEventFromGLsyncKHR_ptr == NULL) {
+    printf(
+        "ERROR: Unable to run fence_sync test (clCreateEventFromGLsyncKHR "
+        "function not discovered!)\n");
+    return;
+  }
+
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    makeCurrent(contextData_[i].glContext);
+
+    // Generate and Bind in & out OpenGL buffers
+    GLuint inGLBuffer = 0, outGLBuffer = 0;
+    glGenBuffers(1, &inGLBuffer);
+    glGenBuffers(1, &outGLBuffer);
+
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    glFinish();
+
+    // Checking if clWaitForEvents works
+    switch (_openTest) {
+      case 0:  // Using fence sync
+        glFence0 = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+        CHECK_RESULT((glFence0 == NULL), "Unable to create GL fence");
+
+        fenceEvent0 = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext,
+                                                     glFence0, &error_);
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to create CL event from GL fence (%d)", error_);
+
+        error_ = clWaitForEvents(1, &fenceEvent0);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)",
+                     error_);
+        break;
+      default:
+        glFinish();
+        break;
+    }
+
+    if (fenceEvent0 != NULL) {
+      clReleaseEvent(fenceEvent0);
+      glDeleteSync(glFence0);
+    }
+
+    cl_event acqEvent1 = 0, acqEvent2 = 0, kernelEvent = 0, relEvent1 = 0,
+             relEvent2 = 0;
+
+    // Create input buffer from GL input buffer
+    contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create input GL buffer (%d)", error_);
+
+    // Create output buffer from GL output buffer
+    contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create output GL buffer (%d)", error_);
+
+    timer.Reset();
+    switch (_openTest) {
+      case 0:  // Using fence sync
+        timer.Start();
+        glFence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+        timer.Stop();
+        CHECK_RESULT((glFence == NULL), "Unable to create GL fence");
+
+        timer.Start();
+        fenceEvent = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext,
+                                                    glFence, &error_);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to create CL event from GL fence (%d)", error_);
+        break;
+      default:
+        break;
+    }
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem),
+                                 &(contextData_[i].inputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem),
+                                 &(contextData_[i].outputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    switch (_openTest) {
+      case 0:  // Using fence sync
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 1,
+            &fenceEvent, &acqEvent1);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1,
+            &fenceEvent, &acqEvent2);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+        break;
+      case 1:  // Using glFinish
+        timer.Start();
+        glFinish();
+        timer.Stop();
+
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 0,
+            NULL, &acqEvent1);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+
+        timer.Start();
+        error_ = _wrapper->clEnqueueAcquireGLObjects(
+            contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0,
+            NULL, &acqEvent2);
+        timer.Stop();
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "Unable to acquire GL objects (%d)", error_);
+        break;
+      default:
+        break;
+    }
+
+    size_t gws[1] = {c_numOfElements};
+    cl_event evts[2] = {acqEvent1, acqEvent2};
+    error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue,
+                                              contextData_[i].clKernel, 1, NULL,
+                                              gws, NULL, 2, evts, &kernelEvent);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1,
+                                                 &(contextData_[i].inputBuffer),
+                                                 1, &kernelEvent, &relEvent1);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1,
+        &kernelEvent, &relEvent2);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    evts[0] = relEvent1;
+    evts[1] = relEvent2;
+    error_ = clWaitForEvents(2, evts);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)",
+                 error_);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+    memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4));
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+    _wrapper->clReleaseMemObject(contextData_[i].inputBuffer);
+    _wrapper->clReleaseMemObject(contextData_[i].outputBuffer);
+
+    // Delete GL buffers
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glDeleteBuffers(1, &inGLBuffer);
+    inGLBuffer = 0;
+    glDeleteBuffers(1, &outGLBuffer);
+    outGLBuffer = 0;
+  }
+
+  sec = timer.GetElapsedTime();
+  perf = (float)sec * 1000000;  // in microseconds
+  _perfInfo = (float)perf;
+
+  if (fenceEvent != NULL) {
+    clReleaseEvent(fenceEvent);
+    glDeleteSync(glFence);
+  }
+
+  // Compare expected output with actual data received
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < m; j++) {
+      if (inOutData[i].s[j] != expectedData[i].s[j]) {
+        printf(
+            "Element %u is incorrect!\t expected:[ %u, %u, %u, %u ] differs "
+            "from actual:{%u, %u, %u, %u}\n",
+            i, expectedData[i].s[0], expectedData[i].s[1], expectedData[i].s[2],
+            expectedData[i].s[3], inOutData[i].s[0], inOutData[i].s[1],
+            inOutData[i].s[2], inOutData[i].s[3]);
+
+        count++;
+      }
+    }
+  }
+  CHECK_RESULT((count != 0), "Number of elements wrong: %d", count);
+}
+
+unsigned int OCLGLFenceSync::close() {
+  // Tear down per-context objects only if the extension check passes again.
+  error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event");
+  const bool teardown = (error_ == CL_SUCCESS);
+  for (unsigned int i = 0; teardown && i < c_glContextCount; i++) {
+    GLContextDataSet &ctx = contextData_[i];
+    makeCurrent(ctx.glContext);
+    _wrapper->clReleaseKernel(ctx.clKernel);
+    _wrapper->clReleaseProgram(ctx.clProgram);
+    _wrapper->clReleaseCommandQueue(ctx.clCmdQueue);
+    _wrapper->clReleaseContext(ctx.clContext);
+    destroyGLContext(ctx.glContext);
+  }
+  return OCLGLCommon::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h
new file mode 100644
index 0000000000..af168485cc
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_FENCE_SYNC_H_
+#define _OCL_GL_FENCE_SYNC_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLFenceSync : public OCLGLCommon {  // CL/GL test built on OCLGLCommon
+ public:
+  OCLGLFenceSync();
+  virtual ~OCLGLFenceSync();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  static const unsigned int c_glContextCount = 1;    // entries in contextData_
+  static const unsigned int c_numOfElements = 8192;  // elements run() compares
+
+  struct GLContextDataSet {  // one GL context and the CL objects tied to it
+    OCLGLHandle glContext;
+    cl_context clContext;         // released in close()
+    cl_command_queue clCmdQueue;  // released in close()
+    cl_program clProgram;         // released in close()
+    cl_kernel clKernel;           // released in close()
+    cl_mem inputBuffer;
+    cl_mem outputBuffer;  // released at the end of each run() pass
+  };
+  GLContextDataSet contextData_[c_glContextCount];
+
+  bool failed_;
+  bool extensionSupported_;
+};
+
+#endif  // _OCL_GL_FENCE_SYNC_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp
new file mode 100644
index 0000000000..c2ba6a10f4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp
@@ -0,0 +1,298 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLMsaaTexture.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =
+    "__kernel void gl_msaa_test( __global uint4 *output, read_only "
+    "image2d_msaa_t source, unsigned int numSamples){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    for (int i = 0 ; i < numSamples ; i++) {\n"
+    "       uint4 value = read_imageui( source, (int2)( tidX, tidY ) ,i);\n"
+    "       int index = (tidY * get_image_width( source ) + tidX)*numSamples + "
+    "i;\n"
+    "       output[ index ] =  value;\n"
+    "   }\n"
+    "}\n";
+
+const static char* glDownSampleShader =
+    "uniform sampler2DMS MsaaTex;\n"
+    "uniform int numSamples;\n"
+    "uniform ivec2 resolution;\n"
+    "\n"
+    "varying vec4  gl_TexCoord[ ];  \n"
+    "\n"
+    "void main(void)\n"
+    "{\n"
+    "    vec4 accum = vec4(0.0,0.0,0.0,0.0);\n"
+    "    ivec2 coord = ivec2(resolution * gl_TexCoord[0].xy) ;\n"
+    "    for ( int i = 0 ; i < numSamples ; i++)\n"
+    "    {\n"
+    "        accum += texelFetch(MsaaTex,coord,i);\n"
+    "    }\n"
+    "    accum /= numSamples;\n"
+    "    \n"
+    "  \n"
+    "        \n"
+    "    gl_FragColor = accum;\n"
+    "}";
+
+OCLGLMsaaTexture::OCLGLMsaaTexture()
+    : msaaDepthBuffer_(0),
+      msaaFrameBufferOBJ_(0),
+      msaaColorBuffer_(0),
+      glShader_(0),
+      glprogram_(0),
+      clOutputBuffer_(0),
+      clMsaa_(0),
+      pGLOutput_(0),
+      pCLOutput_(0) {  // every GL/CL handle and host pointer starts at zero
+  _numSubTests = 1;  // run() only implements sub-test 0
+  _currentTest = 0;
+}
+
+OCLGLMsaaTexture::~OCLGLMsaaTexture() {}
+
+// Opens the base GL test, then builds strKernel and creates the gl_msaa_test
+// kernel; every failure is reported through CHECK_RESULT.
+void OCLGLMsaaTexture::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  _currentTest = test;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled so printf sees a terminated string even if the query fails.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+  kernel_ = _wrapper->clCreateKernel(program_, "gl_msaa_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Runs the selected sub-test; does nothing if an earlier step set _errorFlag.
+void OCLGLMsaaTexture::run(void) {
+  if (_errorFlag) return;
+  // Start from "failed" so an unknown sub-test can never read garbage.
+  bool retVal = false;
+  switch (_currentTest) {
+    case 0:
+      retVal = testMsaaRead(GL_RGBA, 2);
+      break;
+    default:
+      CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT((retVal != true), "cl-gl msaa test failed ");
+}
+
+// Frees the host buffers and the CL/GL objects created by testMsaaRead(),
+// then hands the remaining teardown to OCLGLCommon::close().
+unsigned int OCLGLMsaaTexture::close(void) {
+  // free(NULL) is a no-op, so the pointers need no guard.
+  free(pGLOutput_);
+  pGLOutput_ = NULL;
+  free(pCLOutput_);
+  pCLOutput_ = NULL;
+
+  // Release through _wrapper like the other tests, and skip handles that were
+  // never created (testMsaaRead() can return before creating them).
+  if (clMsaa_) _wrapper->clReleaseMemObject(clMsaa_);
+  clMsaa_ = 0;
+  if (clOutputBuffer_) _wrapper->clReleaseMemObject(clOutputBuffer_);
+  clOutputBuffer_ = 0;
+
+  glFinish();
+  // unbind the texture and frame buffer.
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0);
+
+  // clean gl resources
+  glDeleteFramebuffers(1, &msaaFrameBufferOBJ_);
+  msaaFrameBufferOBJ_ = 0;
+  glDeleteTextures(1, &msaaColorBuffer_);
+  msaaColorBuffer_ = 0;
+  glDeleteTextures(1, &msaaDepthBuffer_);
+  msaaDepthBuffer_ = 0;
+  glDeleteProgram(glprogram_);
+  glDeleteShader(glShader_);
+  return OCLGLCommon::close();
+}
+
+bool OCLGLMsaaTexture::testMsaaRead(GLint internalFormat,
+                                    unsigned int numSamples) {
+  size_t dimSizes[] = {c_dimSize, c_dimSize};  // 2D global work size
+  // NOTE(review): strKernel stores numSamples uint4 per pixel - size too small?
+  unsigned int bufferSize = c_dimSize * c_dimSize * 4;
+  bool retVal = false;
+  createGLFragmentProgramFromSource(glDownSampleShader, glShader_, glprogram_);
+
+  /////////////////////
+  // create msaa FBO //
+  /////////////////////
+  glGenFramebuffers(1, &msaaFrameBufferOBJ_);
+  glBindFramebuffer(GL_FRAMEBUFFER, msaaFrameBufferOBJ_);
+
+  // create color/depth textures (format is fixed; internalFormat is unused)
+  glGenTextures(1, &msaaColorBuffer_);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_);
+  glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples, GL_RGBA8,
+                          c_dimSize, c_dimSize, GL_TRUE);
+
+  glGenTextures(1, &msaaDepthBuffer_);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaDepthBuffer_);
+  glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples,
+                          GL_DEPTH_COMPONENT24, c_dimSize, c_dimSize, GL_TRUE);
+
+  // attach both textures to the bound framebuffer
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, msaaColorBuffer_,
+                       0);
+  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, msaaDepthBuffer_,
+                       0);
+
+  // give up unless the framebuffer is complete
+  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  if (GL_FRAMEBUFFER_COMPLETE != status) {
+    return false;
+  }
+  // set up gl state machine
+  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
+  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
+  glLoadIdentity();                        // Reset The Projection Matrix
+  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
+  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
+  glLoadIdentity();
+  glEnable(GL_DEPTH_TEST);
+  // (no glDepthFunc call follows; the current depth function is used)
+  glClear(GL_COLOR_BUFFER_BIT |
+          GL_DEPTH_BUFFER_BIT);     // Clear Screen And Depth Buffer
+  glBegin(GL_QUADS);                // Draw A Quad
+  glVertex3f(-1.0f, 1.0f, -6.0f);   // Top Left
+  glVertex3f(1.0f, 1.0f, -6.0f);    // Top Right
+  glVertex3f(1.0f, -1.0f, -3.0f);   // Bottom Right
+  glVertex3f(-1.0f, -1.0f, -3.0f);  // Bottom Left
+  glEnd();
+
+  glFinish();  // GL rendering is finished before CL acquires the texture
+  cl_int error;
+  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                             bufferSize, NULL, &error);
+  if (CL_SUCCESS != error) return false;
+
+  clMsaa_ = _wrapper->clCreateFromGLTexture(context_, CL_MEM_READ_WRITE,
+                                            GL_TEXTURE_2D_MULTISAMPLE, 0,
+                                            msaaColorBuffer_, &error);
+  if (CL_SUCCESS != error) return false;
+
+  GLsizei samples;  // queried below but never read afterwards
+  error = _wrapper->clGetGLTextureInfo(clMsaa_, CL_GL_NUM_SAMPLES,
+                                       sizeof(samples), &samples, NULL);
+
+  error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
+                                              &clMsaa_, 0, NULL, NULL);
+  if (CL_SUCCESS != error) return false;
+
+  _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
+
+  _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clMsaa_);
+
+  _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), &numSamples);
+
+  _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL,
+                                   dimSizes, NULL, 0, NULL, NULL);
+
+  _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clMsaa_, 0,
+                                      NULL, NULL);
+
+  pGLOutput_ = (unsigned int*)malloc(bufferSize);  // both freed in close()
+  pCLOutput_ = (unsigned int*)malloc(bufferSize);
+
+  _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_, CL_TRUE,
+                                0, bufferSize, pCLOutput_, 0, NULL, NULL);
+
+  // down sample: draw with glprogram_ into the default framebuffer
+  glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_);
+  glUseProgram(glprogram_);
+
+  glUniform1i(glGetUniformLocation(glprogram_, "numSamples"), numSamples);
+  glUniform2i(glGetUniformLocation(glprogram_, "resolution"), c_dimSize,
+              c_dimSize);
+  glUniform1i(glGetUniformLocation(glprogram_, "MsaaTex"), 0);
+
+  // NOTE(review): each glTexCoord2f comes after its glVertex2f - confirm
+
+  glBegin(GL_QUADS);
+  glVertex2f(-1.0f, 1.0f);
+  glTexCoord2f(1.0f, 0.0f);
+  glVertex2f(1.0f, 1.0f);
+  glTexCoord2f(1.0f, 1.0f);
+  glVertex2f(1.0f, -1.0f);
+  glTexCoord2f(0.0f, 1.0f);
+  glVertex2f(-1.0f, -1.0f);
+  glTexCoord2f(0.0f, 0.0f);
+  glEnd();
+
+  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0);
+  glUseProgram(0);
+
+  glReadPixels(0, 0, c_dimSize, c_dimSize, GL_BGRA, GL_UNSIGNED_BYTE,
+               pGLOutput_);
+
+  if (absDiff(pGLOutput_, pCLOutput_, c_dimSize)) retVal = true;
+
+  return retVal;  // true only when absDiff() accepted the two buffers
+}
+
+// Returns true when every byte channel of the first c_dimSize * c_dimSize
+// pixels of pGLBuffer and pCLBuffer differs by at most 10.
+bool OCLGLMsaaTexture::absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
+                               const unsigned int c_dimSize) {
+  bool retVal = true;
+  for (unsigned int i = 0; i < c_dimSize * c_dimSize; i++) {
+    // Unsigned bytes, so channel values above 127 do not turn negative.
+    unsigned char clPixel[4], glPixel[4];
+    memcpy(clPixel, &(pCLBuffer[i]), sizeof(clPixel));
+    memcpy(glPixel, &(pGLBuffer[i]), sizeof(glPixel));
+    for (int j = 0; j < 4; j++) {
+      // Same channel j on both sides; int math keeps the difference exact.
+      if (abs((int)clPixel[j] - (int)glPixel[j]) > 10) retVal = false;
+    }
+  }
+  return retVal;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h
new file mode 100644
index 0000000000..f3c1ab6296
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_MSAA_TEXTURE_H_
+#define _OCL_GL_MSAA_TEXTURE_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLMsaaTexture : public OCLGLCommon {
+ public:
+  OCLGLMsaaTexture();
+  virtual ~OCLGLMsaaTexture();
+  static const unsigned int c_dimSize = 128;  // width/height of MSAA textures
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  ////////////////////
+  // test functions //
+  ////////////////////
+  bool testMsaaRead(GLint internalFormat, unsigned int NumSamples);
+  unsigned int _currentTest;  // sub-test index stored by open()
+
+  //////////////////////////////
+  // private helper functions //
+  //////////////////////////////
+
+  // returns whether the GL and CL pixel buffers match within a tolerance.
+  static bool absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
+                      const unsigned int dimSize);
+
+  /////////////////////
+  // private members //
+  /////////////////////
+  // GL resource identifiers
+  GLuint msaaDepthBuffer_;
+  GLuint msaaFrameBufferOBJ_;
+  GLuint msaaColorBuffer_;
+  GLuint glShader_;
+  GLuint glprogram_;
+  // CL identifiers
+  cl_mem clOutputBuffer_;
+  cl_mem clMsaa_;  // CL image created from msaaColorBuffer_
+
+  unsigned int* pGLOutput_;  // host copy of the glReadPixels() result
+  unsigned int* pCLOutput_;  // host copy of clOutputBuffer_
+};
+
+#endif  // _OCL_GL_MSAA_TEXTURE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp
new file mode 100644
index 0000000000..f46640741e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp
@@ -0,0 +1,231 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLMultiContext.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =
+    "__kernel void glmulticontext_test( __global uint4 *source, __global uint4 "
+    "*dest)   \n"
+    "{                                                                         "
+    "         \n"
+    "    int  tid = get_global_id(0);                                          "
+    "         \n"
+    "    dest[ tid ] = source[ tid ] + (uint4)(1);                             "
+    "         \n"
+    "}                                                                         "
+    "         \n";
+
+OCLGLMultiContext::OCLGLMultiContext() : failed_(false) {
+  memset(contextData_, 0, sizeof(contextData_));  // all handles start null
+  _numSubTests = 1;
+}
+
+OCLGLMultiContext::~OCLGLMultiContext() {}
+
+// Opens the base GL test, then for each of the c_glContextCount GL contexts
+// creates a CL context, command queue, program and kernel bound to it.
+void OCLGLMultiContext::open(unsigned int test, char* units, double& conversion,
+                             unsigned int deviceId) {
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  cl_context_properties properties[7] = {0};
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    createGLContext(contextData_[i].glContext);
+    getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties);
+
+    // Create new CL context from GL context
+    contextData_[i].clContext = _wrapper->clCreateContext(
+        properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)",
+                 error_);
+
+    // Create command queue for new context
+    contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue(
+        contextData_[i].clContext, devices_[_deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
+                 error_);
+
+    // Build the kernel
+    contextData_[i].clProgram = _wrapper->clCreateProgramWithSource(
+        contextData_[i].clContext, 1, &strKernel, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clCreateProgramWithSource()  failed (%d)", error_);
+    error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1,
+                                      &devices_[deviceId], NULL, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Zero-filled so printf sees a terminated string if the query fails.
+      char programLog[1024] = {0};
+      _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram,
+                                      devices_[deviceId], CL_PROGRAM_BUILD_LOG,
+                                      1024, programLog, 0);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
+                 error_);
+    contextData_[i].clKernel = _wrapper->clCreateKernel(
+        contextData_[i].clProgram, "glmulticontext_test", &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
+                 error_);
+  }
+}
+
+// Runs the +1 kernel once per GL context. Each pass uploads inOutData into a
+// GL buffer, processes it through CL and reads the result back into inOutData,
+// so the final values must equal the random input plus c_glContextCount.
+void OCLGLMultiContext::run() {
+  if (_errorFlag) return;
+  cl_uint4 inOutData[c_numOfElements] = {{{0}}};
+  cl_uint4 expectedData[c_numOfElements] = {{{0}}};
+
+  // Initialize input data with random values
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      inOutData[i].s[j] = (unsigned int)rand();
+      expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount;
+    }
+  }
+
+  for (unsigned int i = 0; i < c_glContextCount; i++) {
+    makeCurrent(contextData_[i].glContext);
+
+    // Generate and Bind in & out OpenGL buffers
+    GLuint inGLBuffer = 0, outGLBuffer = 0;
+    glGenBuffers(1, &inGLBuffer);
+    glGenBuffers(1, &outGLBuffer);
+
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL,
+                 GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glFinish();
+
+    // Create input buffer from GL input buffer
+    contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create input GL buffer (%d)", error_);
+
+    // Create output buffer from GL output buffer
+    contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer(
+        contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "Unable to create output GL buffer (%d)", error_);
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem),
+                                 &(contextData_[i].inputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    error_ =
+        _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem),
+                                 &(contextData_[i].outputBuffer));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueAcquireGLObjects(contextData_[i].clCmdQueue, 1,
+                                                 &(contextData_[i].inputBuffer),
+                                                 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueAcquireGLObjects(
+        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL,
+        NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
+                 error_);
+
+    size_t gws[1] = {c_numOfElements};
+    error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue,
+                                              contextData_[i].clKernel, 1, NULL,
+                                              gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
+                 error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1,
+                                                 &(contextData_[i].inputBuffer),
+                                                 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(
+        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL,
+        NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clEnqueueReleaseGLObjects failed (%d)", error_);
+
+    error_ = _wrapper->clFinish(contextData_[i].clCmdQueue);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_);
+
+    // Read the result back; it becomes the input of the next context.
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
+    memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4));
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+
+    _wrapper->clReleaseMemObject(contextData_[i].inputBuffer);
+    _wrapper->clReleaseMemObject(contextData_[i].outputBuffer);
+
+    // Delete GL buffers
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glDeleteBuffers(1, &inGLBuffer);
+    glDeleteBuffers(1, &outGLBuffer);
+  }
+
+  // Compare expected output with actual data received
+  for (unsigned int i = 0; i < c_numOfElements; i++) {
+    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
+      CHECK_RESULT((inOutData[i].s[j] != expectedData[i].s[j]),
+                   "Element %u is incorrect!\n\t expected:{%u, %u, %u, %u} "
+                   "differs from actual:{%u, %u, %u, %u}",
+                   i, expectedData[i].s[0], expectedData[i].s[1],
+                   expectedData[i].s[2], expectedData[i].s[3],
+                   inOutData[i].s[0], inOutData[i].s[1], inOutData[i].s[2],
+                   inOutData[i].s[3]);
+    }
+  }
+}
+
+unsigned int OCLGLMultiContext::close() {
+  for (unsigned int i = 0; i < c_glContextCount; i++) {  // undo open() per slot
+    makeCurrent(contextData_[i].glContext);
+    _wrapper->clReleaseKernel(contextData_[i].clKernel);
+    _wrapper->clReleaseProgram(contextData_[i].clProgram);
+    _wrapper->clReleaseCommandQueue(contextData_[i].clCmdQueue);
+    _wrapper->clReleaseContext(contextData_[i].clContext);
+    destroyGLContext(contextData_[i].glContext);  // GL context goes last
+  }
+  return OCLGLCommon::close();  // base-class teardown runs after the loop
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h
new file mode 100644
index 0000000000..14983339f3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_MULTI_CONTEXT_H_
+#define _OCL_GL_MULTI_CONTEXT_H_
+
+#include "OCLGLCommon.h"
+
+class OCLGLMultiContext : public OCLGLCommon {
+ public:
+  OCLGLMultiContext();
+  virtual ~OCLGLMultiContext();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  static const unsigned int c_glContextCount = 3;   // entries in contextData_
+  static const unsigned int c_numOfElements = 128;  // cl_uint4 items per buffer
+
+  struct GLContextDataSet {  // one GL context and the CL objects tied to it
+    OCLGLHandle glContext;
+    cl_context clContext;         // created in open(), released in close()
+    cl_command_queue clCmdQueue;  // created in open(), released in close()
+    cl_program clProgram;         // created in open(), released in close()
+    cl_kernel clKernel;           // created in open(), released in close()
+    cl_mem inputBuffer;           // created and released inside run()
+    cl_mem outputBuffer;          // created and released inside run()
+  };
+  GLContextDataSet contextData_[c_glContextCount];
+
+  bool failed_;  // no code in OCLGLMultiContext.cpp reads this flag
+};
+
+#endif  // _OCL_GL_MULTI_CONTEXT_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp
new file mode 100644
index 0000000000..8b5a658893
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp
@@ -0,0 +1,144 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGLTexture.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernelui =  // copies one texel per work-item with read/write_imageui
+    "__kernel void gltexture_test(read_only image2d_t source, write_only "
+    "image2d_t dest)    \n"
+    "{                                                                         "
+    "             \n"
+    "    int  tidX = get_global_id(0);                                         "
+    "             \n"
+    "    int  tidY = get_global_id(1);                                         "
+    "             \n"
+    "    uint4 pixel = read_imageui(source, (int2)(tidX, tidY));               "
+    "             \n"
+    "    write_imageui(dest, (int2)(tidX, tidY), pixel);                       "
+    "             \n"
+    "}";
+// Same kernel name and arguments as above, using read_imagef/write_imagef.
+const static char* strKernelf =
+    "__kernel void gltexture_test(read_only image2d_t source, write_only "
+    "image2d_t dest)    \n"
+    "{                                                                         "
+    "             \n"
+    "    int  tidX = get_global_id(0);                                         "
+    "             \n"
+    "    int  tidY = get_global_id(1);                                         "
+    "             \n"
+    "    float4 pixel = read_imagef(source, (int2)(tidX, tidY));               "
+    "             \n"
+    "    write_imagef(dest, (int2)(tidX, tidY), pixel);                        "
+    "            \n"
+    "}                                                                         "
+    "             \n";
+
+// Starts with no GL/CL resources; 4 texel formats x {copy, render + copy}.
+OCLGLTexture::OCLGLTexture()
+    : currentTest_(0), inDataGL_(NULL), outDataGL_(NULL), inGLTexture_(0),
+      outGLTexture_(0), testRender_(false) { _numSubTests = 4 * 2; }
+
+OCLGLTexture::~OCLGLTexture() {}
+
+// Opens the OCLGLCommon base, selects the sub-test and builds gltexture_test.
+void OCLGLTexture::open(unsigned int test, char* units, double& conversion,
+                        unsigned int deviceId) {
+  // Seed rand(), which runTextureTest() uses to fill the input texture.
+  srand((unsigned int)time(NULL));
+
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  // test % 4 picks the texel format; tests 4-7 additionally enable rendering.
+  currentTest_ = test % 4;
+  testRender_ = ((test / 4) >= 1) ? true : false;
+
+  // Format 0 gets the integer kernel, all other formats the float kernel.
+  if (0 == currentTest_) {
+    program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernelui,
+                                                   NULL, &error_);
+  } else {
+    program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernelf,
+                                                   NULL, &error_);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateProgramWithSource()  failed (%d)", error_);
+  // On a build failure, print the build log before reporting the error.
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays "" if the query below fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
+
+  kernel_ = _wrapper->clCreateKernel(program_, "gltexture_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
+}
+
+// Runs the texture round trip for the texel format selected by open().
+void OCLGLTexture::run(void) {
+  bool passed = false;
+  if (0 == currentTest_) {
+    // 32-bit unsigned integer components
+    passed = runTextureTest<unsigned int>(GL_RGBA32UI, GL_RGBA_INTEGER,
+                                          GL_UNSIGNED_INT);
+  } else if (1 == currentTest_) {
+    // 8-bit components
+    passed = runTextureTest<unsigned char>(GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE);
+  } else if (2 == currentTest_) {
+    // 16-bit components
+    passed = runTextureTest<short>(GL_RGBA16, GL_RGBA, GL_SHORT);
+  } else if (3 == currentTest_) {
+    // 32-bit float components
+    passed = runTextureTest<float>(GL_RGBA32F, GL_RGBA, GL_FLOAT);
+  } else {
+    // CHECK_RESULT records the error and returns from run().
+    CHECK_RESULT(true, "unsupported test number\n");
+  }
+  CHECK_RESULT(!passed, "cl-gl texture interop test failed ");
+}
+
+// Releases CL images, GL textures and host arrays, then chains to the base.
+unsigned int OCLGLTexture::close(void) {
+  // buffers_ may hold fewer than two images if open()/run() bailed out early.
+  for (size_t i = 0; i < buffers_.size(); ++i) clReleaseMemObject(buffers_[i]);
+  buffers_.clear();
+  glFinish();
+  glBindTexture(GL_TEXTURE_2D, 0);
+  glDeleteTextures(1, &inGLTexture_);
+  inGLTexture_ = 0;
+  glDeleteTextures(1, &outGLTexture_);
+  outGLTexture_ = 0;
+
+  free(inDataGL_);
+  inDataGL_ = NULL;
+  free(outDataGL_);
+  outDataGL_ = NULL;
+  return OCLGLCommon::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h
new file mode 100644
index 0000000000..412eddbb37
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h
@@ -0,0 +1,214 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GL_TEXTURE_H_
+#define _OCL_GL_TEXTURE_H_
+
+#include <iostream>
+
+#include "OCLGLCommon.h"
+
+class OCLGLTexture : public OCLGLCommon {  // copies a GL texture through an OpenCL kernel
+ public:
+  static const unsigned int c_imageWidth = 512;
+  static const unsigned int c_imageHeight = 512;
+  static const unsigned int c_elementsPerPixel = 4;
+  // Host arrays and textures hold width * height * elementsPerPixel elements.
+  OCLGLTexture();
+  virtual ~OCLGLTexture();
+  // open() builds the kernel, run() dispatches on currentTest_, close() frees.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  unsigned int currentTest_;  // test % 4, selects the texel format in run()
+  void* inDataGL_;            // malloc'd host input, freed in close()
+  void* outDataGL_;           // malloc'd host read-back, freed in close()
+  GLuint inGLTexture_;
+  GLuint outGLTexture_;
+  bool testRender_;  // true for tests >= 4: also clear the input via a framebuffer
+  template <typename T>
+  bool runTextureTest(GLint internalFormat, GLenum format, GLenum type);
+};
+
+// Round-trips an RGBA texture of element type T through OpenCL: uploads random
+// host data to a GL texture, copies it into a second GL texture with kernel_,
+// reads the copy back and compares. With testRender_ set, the input texture is
+// cleared through a framebuffer and the copy is checked against that color.
+template <typename T>
+bool OCLGLTexture::runTextureTest(GLint internalFormat, GLenum format,
+                                  GLenum type) {
+  const size_t elementCount = c_imageWidth * c_imageHeight * c_elementsPerPixel;
+  inDataGL_ = malloc(elementCount * sizeof(T));
+  outDataGL_ = malloc(elementCount * sizeof(T));
+  if (inDataGL_ == NULL || outDataGL_ == NULL) return false;  // out of memory
+  // Initialize input data with random values
+  T* inputIterator = (T*)inDataGL_;
+  for (size_t i = 0; i < elementCount; i++) {
+    inputIterator[i] = (T)(rand() % 255);
+  }
+  // Initialize output data with zeros
+  memset(outDataGL_, 0, elementCount * sizeof(T));
+
+  // Generate and Bind in & out OpenGL textures
+  glGenTextures(1, &inGLTexture_);
+  glGenTextures(1, &outGLTexture_);
+  glBindTexture(GL_TEXTURE_2D, inGLTexture_);
+  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth,
+               (GLsizei)c_imageHeight, 0, format, type, inDataGL_);
+
+  glBindTexture(GL_TEXTURE_2D, outGLTexture_);
+  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth,
+               (GLsizei)c_imageHeight, 0, format, type, outDataGL_);
+  glFinish();
+
+  // Create input buffer from GL input texture
+  cl_mem image = _wrapper->clCreateFromGLTexture(
+      context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, inGLTexture_, &error_);
+  if (error_ != CL_SUCCESS) {
+    printf("Unable to create input buffer from GL texture (%d)", error_);
+    return false;
+  }
+  buffers_.push_back(image);
+
+  // Create output buffer from GL output texture
+  image = _wrapper->clCreateFromGLTexture(
+      context_, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, outGLTexture_, &error_);
+  if (error_ != CL_SUCCESS) {
+    printf("Unable to create output buffer from GL texture (%d)", error_);
+    return false;
+  }
+  buffers_.push_back(image);
+  size_t gws[2] = {c_imageWidth, c_imageHeight};
+
+  // Assign args
+  for (unsigned int i = 0; i < buffers_.size(); i++) {
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]);
+    if (error_ != CL_SUCCESS) {
+      printf("clSetKernelArg() failed (%d)", error_);
+      return false;
+    }
+  }
+
+  // Stays bound for the whole loop: the second glClear below draws through it.
+  GLuint frameBuffer = 0;
+  int loop = (testRender_) ? 2 : 1;
+  for (int l = 0; l < loop; ++l) {
+    if (testRender_ && (l == 0)) {
+      glGenFramebuffers(1, &frameBuffer);
+      glBindFramebuffer(GL_FRAMEBUFFER, frameBuffer);
+      glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, inGLTexture_,
+                           0);
+      glClearColor(.5f, 1.f, 1.0f, 0);
+      glClear(GL_COLOR_BUFFER_BIT);
+      glFinish();
+    }
+
+    error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2,
+                                                 &buffers()[0], 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      printf("Unable to acquire GL objects (%d)", error_);
+      return false;
+    }
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      printf("clEnqueueNDRangeKernel() failed (%d)", error_);
+      return false;
+    }
+
+    error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2,
+                                                 &buffers()[0], 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      printf("clEnqueueReleaseGLObjects failed (%d)", error_);
+      return false;
+    }
+
+    error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+    if (error_ != CL_SUCCESS) {
+      printf("clFinish() failed (%d)", error_);
+      return false;
+    }
+    if (testRender_ && (l == 0)) {
+      glClearColor(1.f, 1.f, 1.f, 1.f);
+      glClear(GL_COLOR_BUFFER_BIT);
+      glFinish();
+    }
+  }
+  if (frameBuffer != 0) {  // unbind and release the render-test framebuffer
+    glBindFramebuffer(GL_FRAMEBUFFER, 0);
+    glDeleteFramebuffers(1, &frameBuffer);
+  }
+
+  // Get the results from GL texture
+  glBindTexture(GL_TEXTURE_2D, outGLTexture_);
+  glActiveTexture(GL_TEXTURE0);
+  glGetTexImage(GL_TEXTURE_2D, 0, format, type, outDataGL_);
+
+  // Check output texture data
+  T* outputIterator = (T*)outDataGL_;
+  T color;
+  switch (type) {
+    case GL_UNSIGNED_INT:
+      color = (T)0x3f800000;  // IEEE-754 bit pattern of 1.0f
+      break;
+    case GL_UNSIGNED_BYTE:
+      color = (T)0xff;
+      break;
+    case GL_SHORT:
+      color = (T)0x7fff;
+      break;
+    case GL_FLOAT:
+      color = (T)1.f;
+      break;
+    default:
+      return false;
+  }
+  for (size_t i = 0; i < elementCount; i++) {
+    if (testRender_) {
+      if (outputIterator[i] != color) {
+        std::cout << "Element " << i
+                  << " in output texture is incorrect! (internal format = "
+                  << internalFormat << ")\n\t expected clear color:" << +color
+                  << " differs from actual: " << +outputIterator[i] << std::endl;
+        return false;
+      }
+    } else if (inputIterator[i] != outputIterator[i]) {
+      std::cout << "Element " << i
+                << " in output texture is incorrect! (internal format = "
+                << internalFormat << ")\n\t expected:" << +inputIterator[i]
+                << " differs from actual: " << +outputIterator[i] << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+#endif  // _OCL_GL_TEXTURE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp
new file mode 100644
index 0000000000..7a58fc06c6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+//
+// Includes for tests
+//
+#include "OCLGLBuffer.h"
+#include "OCLGLBufferMultipleQueues.h"
+#include "OCLGLDepthBuffer.h"
+#include "OCLGLDepthTex.h"
+#include "OCLGLFenceSync.h"
+#include "OCLGLMsaaTexture.h"
+#include "OCLGLMultiContext.h"
+#include "OCLGLTexture.h"
+
+//
+//  Helper macro for adding tests
+//
+template <typename T>  // factory: returns a heap-allocated T as void*
+static void* dictionary_CreateTestFunc(void) {
+  return new T();
+}
+// Expands to a TestEntry initializer: {"<name>", factory for class <name>}.
+#define TEST(name) \
+  { #name, &dictionary_CreateTestFunc < name> }
+// NOTE(review): OCLGLDepthBuffer / OCLGLMsaaTexture are included but not listed.
+TestEntry TestList[] = {
+    TEST(OCLGLBuffer),    TEST(OCLGLBufferMultipleQueues),
+    TEST(OCLGLTexture),   TEST(OCLGLMultiContext),
+    TEST(OCLGLFenceSync), TEST(OCLGLDepthTex),
+};
+// Module globals declared extern in OCLTestListImp.h.
+unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);
+unsigned int TestLibVersion = 0;
+const char* TestLibName = "oclgl";
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude b/projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude
new file mode 100644
index 0000000000..39345e8fd7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude
@@ -0,0 +1 @@
+# all clear
diff --git a/projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h b/projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h
new file mode 100644
index 0000000000..92e730d534
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h
@@ -0,0 +1,206 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _BaseTestImp_H_
+#define _BaseTestImp_H_
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+
+#include "OCLTest.h"
+#include "OCLWrapper.h"
+
+#define EXIT_SILENT_FAILURE 2
+#define KERNEL(...) #__VA_ARGS__
+
+#ifdef _MSC_VER
+#define snprintf sprintf_s
+#endif
+
+#define CHECK_ERROR(error, msg)                       \
+  if (error != CL_SUCCESS) {                          \
+    _errorFlag = true;                                \
+    printf("\n\n%s\nError code: %d\n\n", msg, error); \
+    _errorMsg = msg;                                  \
+    _crcword += 1;                                    \
+    return;                                           \
+  }
+
+#define CHECK_ERROR_NO_RETURN(error, msg)             \
+  if (error != CL_SUCCESS) {                          \
+    _errorFlag = true;                                \
+    printf("\n\n%s\nError code: %d\n\n", msg, error); \
+    _errorMsg = msg;                                  \
+    _crcword += 1;                                    \
+  }
+
+#define CHECK_RESULT(test, msg, ...)                  \
+  if ((test)) {                                       \
+    char* buf = (char*)malloc(4096);                  \
+    _errorFlag = true;                                \
+    int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \
+    assert(rc >= 0 && rc < (int)4096);                \
+    printf("%s:%d - %s\n", __FILE__, __LINE__, buf);  \
+    _errorMsg = std::string(buf);                     \
+    _crcword += 1;                                    \
+    free(buf);                                        \
+    return;                                           \
+  }
+
+#define CHECK_RESULT_ARGS CHECK_RESULT
+
+#define CHECK_RESULT_NO_RETURN(test, msg, ...)        \
+  if ((test)) {                                       \
+    char* buf = (char*)malloc(4096);                  \
+    _errorFlag = true;                                \
+    int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \
+    assert(rc >= 0 && rc < (int)4096);                \
+    printf("%s:%d - %s\n", __FILE__, __LINE__, buf);  \
+    _errorMsg = std::string(msg);                     \
+    _crcword += 1;                                    \
+    free(buf);                                        \
+  }
+
+#define CHECK_RESULT_NO_RETURN_ARGS CHECK_RESULT_NO_RETURN
+
+#define CHECK_RESULT_SHUTDOWN(test, msg) \
+  if ((test)) {                          \
+    _errorFlag = true;                   \
+    printf("%s\n", msg);                 \
+    _errorMsg = msg;                     \
+    _crcword += 1;                       \
+    close();                             \
+    return;                              \
+  }
+
+#define CHECK_RESULT_CL(test, msg) \
+  if ((test)) {                    \
+    _errorFlag = true;             \
+    printf("%s\n", msg);           \
+    _errorMsg = msg;               \
+    _crcword += 1;                 \
+    return 1;                      \
+  }
+
+class BaseTestImp : public OCLTest {  // common bookkeeping shared by test implementations
+ public:
+  BaseTestImp();
+  virtual ~BaseTestImp();
+
+ public:
+  virtual unsigned int getThreadUsage(void);
+  virtual int getNumSubTests(void);
+
+  //! Abstract functions being defined here
+  virtual void open();
+  virtual void open(unsigned int test, const char* deviceName,
+                    unsigned int architecture);
+  // Both overloads below forward to open(test, deviceName, architecture).
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId, unsigned int platformIndex) {
+    return open(test, "Tahiti", platformIndex);
+  }
+  // NOTE(review): platformIndex above lands in the 'architecture' slot - confirm.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId) {
+    return open(test, "Tahiti", 0);
+  }
+
+  virtual void run(void) = 0;
+  virtual unsigned int close(void);
+
+  //! Functions to set class members
+  virtual void checkComplib(unsigned int test, const char* deviceName,
+                            unsigned int architecture);
+  virtual void setDeviceName(const char*);
+  virtual const char* getDeviceName();
+  virtual void setErrorMsg(const char* error);
+  virtual const char* getErrorMsg(void);
+  virtual bool hasErrorOccured(void);
+  virtual void clearError();
+  BaseTestImp* toBaseTestImp() { return this; }
+  virtual OCLTestImp* toOCLTestImp() { return NULL; }
+  virtual void useCPU() { _cpu = true; }
+  virtual void setIterationCount(int cnt);
+  virtual void setDeviceId(unsigned int deviceId);
+  virtual unsigned int getDeviceId();
+  virtual void setPlatformIndex(unsigned int platformIndex);
+  virtual unsigned int getPlatformIndex();
+  virtual float getPerfInfo();
+  virtual void clearPerfInfo();
+
+ protected:
+  unsigned int _numSubTests;
+  unsigned int _openTest;
+  unsigned int _useThreads;
+  int _iterationCnt;
+  float _perfInfo;
+  bool _cpu;  // set by useCPU()
+
+  unsigned int _crcword;  // incremented by the CHECK_* macros on every failure
+  unsigned int _crctab[256];
+  // Error state written by the CHECK_* macros.
+  bool _errorFlag;
+  std::string _errorMsg;
+
+  const char* _deviceName;
+  unsigned int _architecture;
+  unsigned int _deviceId;
+  unsigned int _platformIndex;
+  bool failed_ = false;
+  cl_int error_;
+  cl_uint type_;
+  cl_uint deviceCount_;
+  cl_device_id* devices_;
+  cl_context context_;
+
+  cl_program program_;
+  cl_kernel kernel_;
+};
+
+// enum to keep track of different memory types
+enum MemType { LOOCL, REMOTE_CACHED, REMOTE_UNCACHED };  // NOTE(review): LOOCL is presumably a mangled LOCAL - confirm
+
+// Bundles a cl_image_format with a display string and a caller-supplied size.
+class DataType {
+  cl_image_format f;
+  const char* str;
+  unsigned int size;
+
+ public:
+  // Default-constructed entries are zeroed instead of left indeterminate.
+  DataType() : f(), str(NULL), size(0) {}
+  DataType(cl_image_format f, const char* str, unsigned int size) {
+    this->f = f;
+    this->str = str;
+    this->size = size;
+  }
+  operator const char*() { return str; }
+  operator unsigned int() { return size; }
+  operator cl_image_format() { return f; }
+};
+
+// useful for initialization of an array of data types for a test
+#define DTYPE(x, y) DataType(x, #x, (unsigned int)y)
+
+#endif
diff --git a/projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h b/projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h
new file mode 100644
index 0000000000..fe32e08efa
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLTestImp_H_
+#define _OCLTestImp_H_
+
+#include <string>
+#include <vector>
+
+#include "BaseTestImp.h"
+#include "CL/cl.h"
+#include "OCL/Thread.h"
+#include "OCLTest.h"
+#include "OCLWrapper.h"
+
+class OCLTestImp : public BaseTestImp {  // BaseTestImp plus OpenCL wrapper, queues and buffers
+ public:
+  OCLTestImp();
+  virtual ~OCLTestImp();
+
+ public:
+  //! Abstract functions being defined here
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId, unsigned int platformIndex);
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void) = 0;
+  virtual unsigned int close(void);
+  //! Functions to set class members
+
+ public:
+  void useCPU();
+  int genIntRand(int a, int b);
+  int genBitRand(int n);
+  void accumulateCRC(const void* buffer, int len);
+  void setOCLWrapper(OCLWrapper* wrapper);
+  OCLTestImp* toOCLTestImp() { return this; }  // overrides the NULL-returning base version
+
+  static OCLutil::Lock openDeviceLock;
+  static OCLutil::Lock compileLock;
+
+ protected:
+  const std::vector<cl_mem>& buffers() const { return buffers_; }  // read-only view of buffers_
+
+  OCLWrapper* _wrapper;
+
+  int _seed;
+  // Common data of any CL program. error_, type_, deviceCount_, devices_,
+  // context_, program_ and kernel_ hide the same-named BaseTestImp members.
+  cl_int error_;
+  cl_uint type_;
+  cl_uint deviceCount_;
+  cl_device_id* devices_;
+  cl_platform_id platform_;
+  std::vector<cl_command_queue> cmdQueues_;
+  cl_context context_;
+
+  cl_program program_;
+  cl_kernel kernel_;
+  std::vector<cl_mem> buffers_;
+};
+
+// useful for initialization of an array of data types for a test
+#define DTYPE(x, y) DataType(x, #x, (unsigned int)y)
+
+#endif
diff --git a/projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h b/projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h
new file mode 100644
index 0000000000..5dfa6ffd13
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef __Dictionary_h__
+#define __Dictionary_h__
+
+//
+// Testing module (plugin) interface forward declarations
+//
+#ifdef ATI_OS_WIN
+#define OCL_DLLEXPORT __declspec(dllexport)
+#define OCL_CALLCONV __cdecl
+#endif
+#ifdef ATI_OS_LINUX
+#define OCL_DLLEXPORT
+#define OCL_CALLCONV
+#endif
+
+class OCLTest;
+
+//
+//  OCLTestList_TestCount - retrieve the number of tests in the testing module
+//
+extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV OCLTestList_TestCount(void);
+
+//
+//  OCLTestList_TestLibVersion - retrieve the version of test lib in the testing
+//  module
+//
+extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV
+OCLTestList_TestLibVersion(void);
+
+//
+//  OCLTestList_TestLibName - retrieve the name of test library
+//
+extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV OCLTestList_TestLibName(void);
+
+//
+//  OCLTestList_TestName - retrieve the name of the indexed test in the module
+//
+extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV
+OCLTestList_TestName(unsigned int testNum);
+
+//
+//  OCLTestList_CreateTest - create a test by index
+//
+extern "C" OCL_DLLEXPORT OCLTest* OCL_CALLCONV
+OCLTestList_CreateTest(unsigned int testNum);
+
+//
+//  OCLTestList_DestroyTest - destroy a test object
+//
+extern "C" OCL_DLLEXPORT void OCL_CALLCONV
+OCLTestList_DestroyTest(OCLTest* test);
+
+//
+//  internal global data that is populated in each dll
+//
+typedef struct _TestEntry {
+  const char* name;  // test name; the TEST() macro fills in the class name
+  void* (*create)(void);  // factory returning a newly allocated test as void*
+} TestEntry;
+
+extern TestEntry TestList[];
+extern unsigned int TestListCount;
+extern unsigned int TestLibVersion;
+extern const char* TestLibName;
+
+#endif
diff --git a/projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h b/projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h
new file mode 100644
index 0000000000..50adba1c8c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_INCLUDES_H
+#define _OCL_INCLUDES_H
+
+#ifdef ATI_OS_WIN
+#define POINTER_64 __ptr64
+#include <windows.h>
+#include "d3d9.h"
+#endif
+
+#include "CL/cl.h"
+
+#endif  //_OCL_INCLUDES_H
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp
new file mode 100644
index 0000000000..4121c15911
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp
@@ -0,0 +1,211 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerf3DImageWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: SNPRINTF is sprintf_s under WIN_OS, snprintf otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+// KERNEL_CODE stringizes its arguments so kernel source can be written inline.
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+#define NUM_SIZES 4  // image edge lengths tried; each image is size^3 texels
+static const unsigned int Sizes[NUM_SIZES] = {64, 128, 256, 512};
+#define NUM_FORMATS 1  // the three tables below are indexed by format number
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"};
+// Bytes per texel: CL_RGBA has four channels of one byte (cl_uchar) each.
+static const unsigned int formatSize[NUM_FORMATS] = {4 * sizeof(cl_uchar)};
+
+const static char *strKernel = {KERNEL_CODE(
+  \n __kernel void image_kernel(write_only image3d_t input) {
+  size_t x = get_global_id(0);
+  size_t y = get_global_id(1);
+  size_t z = get_global_id(2);
+  // Each work-item stores (1, 1, 1, 1) at its own (x, y, z) texel.
+  int4 coords = (int4)(x, y, z, 0);
+  write_imageui(input, coords, (uint4)(1, 1, 1, 1));
+}
+  \n)};
+
+OCLPerf3DImageWriteSpeed::OCLPerf3DImageWriteSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS;  // one sub-test per (size, format)
+}
+
+OCLPerf3DImageWriteSpeed::~OCLPerf3DImageWriteSpeed() {}  // cleanup is in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerf3DImageWriteSpeed::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  // Prepares sub-test `test`: image edge = Sizes[test % NUM_SIZES], format =
+  // (test / NUM_SIZES) % NUM_FORMATS. Builds image_kernel and creates the 3D
+  // image; sets skip_ and returns early without cl_khr_3d_image_writes.
+  error_ = CL_SUCCESS;
+  testId_ = test;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+  imageBuffer_ = 0;
+  skip_ = false;
+
+  // Ask for the extension string's length first: a fixed 1024-byte buffer
+  // makes the query fail (CL_INVALID_VALUE) on devices with a longer list.
+  size_t extSize = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     0, NULL, &extSize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  std::string extensions(extSize + 1, '\0');  // +1 keeps it NUL-terminated
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
+                                     extSize, &extensions[0], NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (!strstr(extensions.c_str(), "cl_khr_3d_image_writes")) {
+    skip_ = true;
+    testDescString = "3D Write not supported. Test Skipped.";
+    return;
+  }
+
+  bufSize_ = Sizes[test % NUM_SIZES];
+  bufnum_ = (test / NUM_SIZES) % NUM_FORMATS;
+  memSize_ = bufSize_ * bufSize_ * bufSize_ * formatSize[bufnum_];
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // prints as "" if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  imageBuffer_ = _wrapper->clCreateImage3D(
+      context_, CL_MEM_WRITE_ONLY, &formats[bufnum_], bufSize_, bufSize_,
+      bufSize_, 0, 0, NULL, &error_);
+  CHECK_RESULT(imageBuffer_ == 0, "clCreateImage(imageBuffer_) failed");
+  // set kernel arguments
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+void OCLPerf3DImageWriteSpeed::run(void) {
+  // Runs the kernel once and verifies every byte read back equals 1, then
+  // times numIter launches and reports memSize_ * numIter / elapsed in GB/s.
+  if (skip_) {
+    return;
+  }
+  CPerfCounter timer;
+  unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS;
+
+  size_t gws[3] = {bufSize_, bufSize_, bufSize_};
+  size_t lws[3] = {8, 8, 4};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws,
+                                            lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  // checkData
+  char *bufptr = (char *)malloc(memSize_);
+  CHECK_RESULT(bufptr == NULL, "malloc() failed");
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, bufSize_};
+  size_t image_row_pitch = bufSize_ * formatSize[bufnum_];
+  size_t image_slice_pitch = image_row_pitch * bufSize_;
+  error_ = clEnqueueReadImage(cmd_queue_, imageBuffer_, true, origin, region,
+                              image_row_pitch, image_slice_pitch, bufptr, 0,
+                              NULL, NULL);
+  if (error_ != CL_SUCCESS) free(bufptr);  // check below is expected to return
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+  for (size_t i = 0; i < memSize_; ++i) {
+    if (bufptr[i] != 1) {
+      printf("(%4ux%4ux%4u) fmt:%s(%1u) checkData() fail, image_ptr[%u] = %d\n",
+             bufSize_, bufSize_, bufSize_, textFormats[fmt_num],
+             formatSize[bufnum_], (unsigned int)i, (int)bufptr[i]);
+      // Condition must be true for the macro to record the failure.
+      CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+      char buf[256];
+      SNPRINTF(buf, sizeof(buf),
+               " (%4ux%4ux%4u) fmt:%s(%1u) checkData() FAILED! ", bufSize_,
+               bufSize_, bufSize_, textFormats[fmt_num], formatSize[bufnum_]);
+      testDescString = buf;
+      free(bufptr);
+      return;
+    }
+  }
+  free(bufptr);  // pairs with malloc(); delete on malloc'ed memory is undefined
+
+  // test begins
+  unsigned int numIter = 5;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; ++i) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws,
+                                              lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // write_image speed in GB/s
+  double perf = ((double)memSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%3ux%3ux%3u) fmt:%s(%1u) i: %2u (GB/s) ",
+           bufSize_, bufSize_, bufSize_, textFormats[fmt_num],
+           formatSize[bufnum_], numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerf3DImageWriteSpeed::close(void) {
+  // Release the image created by open() (none exists for a skipped test),
+  // then let the base class finish cleanup and supply the return value.
+  if (!skip_ && imageBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(imageBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(imageBuffer_) failed");
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h
new file mode 100644
index 0000000000..eb6e9ce12c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_3DImageWriteSpeed_H_
+#define _OCL_3DImageWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Perf test: times a kernel that writes every texel of a 3D image.
+class OCLPerf3DImageWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerf3DImageWriteSpeed();
+  virtual ~OCLPerf3DImageWriteSpeed();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_command_queue cmd_queue_;  // queue used for launches and the read-back
+  cl_mem imageBuffer_;          // 3D image written by the kernel
+
+  unsigned int bufSize_;  // image edge length; width, height and depth agree
+  unsigned int bufnum_;   // index into the per-format tables
+  char* memptr;           // not referenced by the visible implementation
+  unsigned int memSize_;  // image size in bytes
+  unsigned int testId_;   // sub-test number passed to open()
+
+  bool skip_;  // true when the device lacks cl_khr_3d_image_writes
+};
+
+#endif  // _OCL_3DImageWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp
new file mode 100644
index 0000000000..599d2cec33
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp
@@ -0,0 +1,451 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfAES256.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const char *aes256_kernel =  // sub-test 0 ("orig"): __global tables
+    "// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT "
+    "REDISTRIBUTE!!\n"
+    "inline uint Load(__global uint* pData, const uint iX, const uint iY)\n"
+    "{\n"
+    "   return pData[iX | (iY << 8)];\n"
+    "}\n"
+    "\n"
+    "\n"
+    "inline uint4 Load4(__global uint* pData, const uint4 uX, const uint iY)\n"
+    "{\n"
+    "   uint  uExtent = iY << 8;\n"
+    "   uint4 uNdx = uX + uExtent;\n"
+    "   \n"
+    "   return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], "
+    "pData[uNdx.w]);\n"
+    "}\n"
+    "\n"
+    "\n"
+    "__kernel \n"
+    "__attribute__((vec_type_hint(uint4))) \n"
+    "void CryptThread(__global uint4* pInput, __global uint4* pOutput,\n"
+    "                       __global uint* pTables,\n"
+    "                       __global uint4* pKey, const uint iRounds)\n"
+    "{\n"
+    "   const uint iNdx = get_global_id(0);\n"
+    "   \n"
+    "   uint4 state, istate, tstate;\n"
+    "   state = pInput[iNdx] ^ pKey[iRounds];\n"
+    "   \n"
+    "   for (uint i = iRounds-1; i; i--)\n"
+    "   {\n"
+    "       istate = state & 0xFF;\n"
+    "       tstate = Load4(pTables, istate.xyzw, 0);\n"
+    "\n"
+    "       istate = (state >> 8) & 0xFF;\n"
+    "       tstate^= Load4(pTables, istate.wxyz, 1);\n"
+    "\n"
+    "       istate = (state >> 16) & 0xFF;\n"
+    "       tstate^= Load4(pTables, istate.zwxy, 2);\n"
+    "\n"
+    "       istate = state >> 24;\n"
+    "       tstate^= Load4(pTables, istate.yzwx, 3);\n"
+    "\n"
+    "       state = tstate ^ pKey[i];\n"
+    "   }\n"
+    "\n"
+    "   istate = state & 0xFF;\n"
+    "   tstate = Load4(pTables, istate.xyzw, 4);\n"
+    "\n"
+    "   istate = (state >> 8) & 0xFF;\n"
+    "   tstate |= Load4(pTables, istate.wxyz, 4) << 8;\n"
+    "\n"
+    "   istate = (state >> 16) & 0xFF;\n"
+    "   tstate |= Load4(pTables, istate.zwxy, 4) << 16;\n"
+    "\n"
+    "   istate = state >> 24;\n"
+    "   tstate |= Load4(pTables, istate.yzwx, 4) << 24;\n"
+    "\n"
+    "   pOutput[iNdx] = tstate ^ pKey[0];\n"
+    "}\n";
+
+static const char *aes256_kernel2 =  // sub-test 1 (" new"): tables via __local
+    "// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT "
+    "REDISTRIBUTE!!\n"
+    "#define AES_BLOCK_SIZE      16\n"
+    "#define AES_TABLE_SIZE      256\n"
+    "\n"
+    "#define AES_TABLE_MAX       5\n"
+    "#define AES_CONST_SIZE      (AES_TABLE_SIZE*AES_TABLE_MAX)\n"
+    "\n"
+    "#define AES_ROUND_128       10\n"
+    "#define AES_ROUND_192       12\n"
+    "#define AES_ROUND_256       14\n"
+    "#define AES_ROUNDKEY_MAX    (AES_BLOCK_SIZE/4*(AES_ROUND_256+1))\n"
+    "#define _IS_GPU_\n"
+    "\n"
+    "\n"
+    "inline uint Load(\n"
+    "#ifdef _IS_GPU_\n"
+    "    __local uint* pData,\n"
+    "#else\n"
+    "    __constant uint* pData,\n"
+    "#endif\n"
+    "    const uint iX, const uint iY)\n"
+    "{\n"
+    "    const uint uNdx = iX + iY*AES_TABLE_SIZE;\n"
+    "    return pData[uNdx];\n"
+    "}\n"
+    "\n"
+    "\n"
+    "inline uint4 Load4(\n"
+    "#ifdef _IS_GPU_\n"
+    "    __local uint* pData,\n"
+    "#else\n"
+    "    __constant uint* pData,\n"
+    "#endif\n"
+    "    const uint4 uX, const uint iY)\n"
+    "{\n"
+    "    const uint  uExtent = iY*AES_TABLE_SIZE;\n"
+    "    const uint4 uNdx = uX + uExtent;\n"
+    "    \n"
+    "    return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], "
+    "pData[uNdx.w]);\n"
+    "}\n"
+    "\n"
+    "\n"
+    "__kernel \n"
+    "__attribute__((vec_type_hint(uint4)))\n"
+    "#ifdef KERNEL_MAX_THREADS\n"
+    "__attribute__((work_group_size_hint(KERNEL_MAX_THREADS, 1, 1)))\n"
+    "#endif\n"
+    "void CryptThread(__global const uint4* pInput, __global uint4* pOutput,\n"
+    "                        __constant uint* pTables,\n"
+    "                        __constant uint4* pKey, const uint iRounds)\n"
+    "{\n"
+    "    const size_t iNdx = get_global_id(0);\n"
+    "\n"
+    "#ifdef _IS_GPU_\n"
+    "    #define Load4T(x, y)    Load4(ulTables, x, y)\n"
+    "\n"
+    "    __local uint  ulTables[AES_CONST_SIZE];\n"
+    "\n"
+    "    const uint iLdx = get_local_id(0);\n"
+    "    if (iLdx < AES_TABLE_SIZE) {\n"
+    "        const uint iGrps = get_local_size(0);\n"
+    "        const uint iLSize = min(iGrps, (uint)AES_TABLE_SIZE);\n"
+    "        const uint iBpL = AES_CONST_SIZE/iLSize;\n"
+    "\n"
+    "        const uint iStart = iLdx*iBpL;\n"
+    "        const uint iEnd   = iStart + iBpL;\n"
+    "\n"
+    "        for (uint i=iStart; i<iEnd; i++) {\n"
+    "            ulTables[i] = pTables[i];\n"
+    "        }\n"
+    "    }\n"
+    "\n"
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "#else\n"
+    "    #define Load4T(x, y)    Load4(pTables, x, y)\n"
+    "#endif\n"
+    "    \n"
+    "    uint4 state, istate, tstate;\n"
+    "    state = pInput[iNdx] ^ pKey[0];\n"
+    "    \n"
+    "    for (uint i = 1; i < iRounds; i++)\n"
+    "    {\n"
+    "        istate = state & 0xFF;\n"
+    "        tstate = Load4T(istate.xyzw, 0);\n"
+    "\n"
+    "        istate = (state >> 8) & 0xFF;\n"
+    "        tstate^= Load4T(istate.yzwx, 1);\n"
+    "\n"
+    "        istate = (state >> 16) & 0xFF;\n"
+    "        tstate^= Load4T(istate.zwxy, 2);\n"
+    "\n"
+    "        istate = state >> 24;\n"
+    "        tstate^= Load4T(istate.wxyz, 3);\n"
+    "\n"
+    "        state = tstate ^ pKey[i];\n"
+    "    }\n"
+    "\n"
+    "    istate = state & 0xFF;\n"
+    "    tstate = Load4T(istate.xyzw, 4);\n"
+    "\n"
+    "    istate = (state >> 8) & 0xFF;\n"
+    "    tstate |= Load4T(istate.yzwx, 4) << 8;\n"
+    "\n"
+    "    istate = (state >> 16) & 0xFF;\n"
+    "    tstate |= Load4T(istate.zwxy, 4) << 16;\n"
+    "\n"
+    "    istate = state >> 24;\n"
+    "    tstate |= Load4T(istate.wxyz, 4) << 24;\n"
+    "\n"
+    "    pOutput[iNdx] = tstate ^ pKey[iRounds];\n"
+    "}\n";
+
+OCLPerfAES256::OCLPerfAES256() { _numSubTests = 2; }  // one per kernel variant
+
+OCLPerfAES256::~OCLPerfAES256() {}  // resources are released in close()
+
+void OCLPerfAES256::setData(cl_mem buffer, unsigned int val) {
+  // Fills all bufSize_ bytes of `buffer` with the 32-bit pattern `val`.
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(cmd_queue_,
+      buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // never write NULL
+  for (size_t i = 0; i < bufSize_ / sizeof(*data); i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfAES256::checkData(cl_mem buffer) {
+  // Placeholder: maps the buffer for reading and unmaps it again without
+  // inspecting the contents (run() notes the expected data is unknown).
+  unsigned int *mapped = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, mapped, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerfAES256::open(unsigned int test, char *units, double &conversion,
+                         unsigned int deviceId) {
+  // Creates a private context and queue on device `deviceId` of platform
+  // _platformIndex, allocates the input/output/table/key buffers, builds the
+  // kernel chosen by `test` (0: aes256_kernel, else aes256_kernel2) and binds
+  // its arguments. Any failed step is reported through CHECK_RESULT.
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  tableBuffer_ = 0;
+  keyBuffer_ = 0;
+  blockSize_ = 1024;
+  maxIterations = 50;
+
+  bufSize_ = 5592320 * sizeof(cl_uint4);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) platform = platforms[_platformIndex];
+    // new[] needs delete[]; free before the check so a failure cannot leak it.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+  // Checked before allocating so devices[_deviceId] is always in range.
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) device = devices[_deviceId];
+  free(devices);  // only the selected id is used from here on
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  // Increase iterations for devices with many CUs. The query returns a
+  // cl_uint, so read it into one instead of directly into the size_t member
+  // (that would leave part of numCUs unwritten where size_t is wider).
+  cl_uint computeUnits = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(computeUnits), &computeUnits, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  numCUs = computeUnits;
+
+  maxIterations *= (unsigned int)(1 + 10 * numCUs / 20);
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_,
+                                       NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_,
+                                        NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  tableBuffer_ =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 5120, NULL, &error_);
+  CHECK_RESULT(tableBuffer_ == 0, "clCreateBuffer(tableBuffer) failed");
+
+  keyBuffer_ =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 240, NULL, &error_);
+  CHECK_RESULT(keyBuffer_ == 0, "clCreateBuffer(keyBuffer) failed");
+
+  if (_openTest == 0) {
+    program_ = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&aes256_kernel, NULL, &error_);
+    CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+    testDescString += "orig";
+  } else {
+    program_ = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&aes256_kernel2, NULL, &error_);
+    CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+    testDescString += " new";
+  }
+
+  const char *buildOps = NULL;
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = {0};  // prints as "" if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+  }
+  // Condition must be true for the macro to record the failure and return.
+  CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  cl_uint rounds = 14;
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(0) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(1) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem),
+                                    (void *)&tableBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(2) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&keyBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(3) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&rounds);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(4) failed");
+  setData(inBuffer_, 0xdeadbeef);
+  setData(outBuffer_, 0xdeadbeef);
+}
+
+void OCLPerfAES256::run(void) {
+  // Times maxIterations back-to-back launches of the kernel, one work-item
+  // per cl_uint4 of the buffer, and stores the throughput (GB/s) in _perfInfo.
+  size_t global_work_size[1] = {bufSize_ / sizeof(cl_uint4)};
+  size_t local_work_size[1] = {64};
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < maxIterations; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              global_work_size,
+                                              local_work_size, 0, NULL, NULL);
+    // Check each launch so a later success cannot hide an earlier failure.
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // No idea what data should be in here
+  // checkData(outBuffer_);
+  // Compute GB/s
+  double perf =
+      ((double)bufSize_ * (double)maxIterations * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+}
+
+unsigned int OCLPerfAES256::close(void) {
+  // Releases whatever open() created (unset handles are 0); returns _crcword.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);  // queue is 0 if open failed
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (tableBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(tableBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(tableBuffer_) failed");
+  }
+  if (keyBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(keyBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(keyBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h
new file mode 100644
index 0000000000..2d7dc0b22d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_AES256_H_
+#define _OCL_AES256_H_
+
+#include "OCLTestImp.h"
+
+// Perf test: throughput of the CryptThread AES kernels (two sub-tests).
+class OCLPerfAES256 : public OCLTestImp {
+ public:
+  OCLPerfAES256();
+  virtual ~OCLPerfAES256();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;  // not referenced by the visible implementation
+  void setData(cl_mem buffer, unsigned int data);  // fill buffer with `data`
+  void checkData(cl_mem buffer);  // map/unmap only; contents are not checked
+
+  cl_context context_;  // created in open(), released in close()
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem inBuffer_;     // kernel argument 0
+  cl_mem outBuffer_;    // kernel argument 1
+  cl_mem tableBuffer_;  // kernel argument 2 (5120 bytes)
+  cl_mem keyBuffer_;    // kernel argument 3 (240 bytes)
+  cl_int error_;        // status of the most recent OpenCL call
+
+  unsigned int width_;         // not referenced by the visible implementation
+  unsigned int bufSize_;       // size in bytes of inBuffer_ and outBuffer_
+  unsigned int blockSize_;     // set to 1024 in open(); otherwise unused there
+  unsigned int maxIterations;  // number of kernel launches timed by run()
+  size_t numCUs;               // CL_DEVICE_MAX_COMPUTE_UNITS of the device
+};
+
+#endif  // _OCL_AES256_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp
new file mode 100644
index 0000000000..af42569224
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp
@@ -0,0 +1,817 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfAtomicSpeed.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "CL/cl.h"
+#include "OCLPerfAtomicSpeedKernels.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: use sprintf_s under WIN_OS and snprintf elsewhere.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Sub-test table: each entry pairs an AtomicType with an input scale factor.
+testOCLPerfAtomicSpeedStruct testOCLPerfAtomicSpeedList[] = {
+    {LocalHistogram, 1},
+    {LocalHistogram, 2},
+    {LocalHistogram, 4},
+    {GlobalHistogram, 1},
+    {GlobalHistogram, 2},
+    {GlobalHistogram, 4},
+    {Global4Histogram, 1},
+    {Global4Histogram, 2},
+    {Global4Histogram, 4},
+    {LocalReductionNoAtomics, 1},
+    {LocalReductionNoAtomics, 2},
+    {LocalReductionNoAtomics, 4},
+    {LocalReductionAtomics, 1},
+    {LocalReductionAtomics, 2},
+    {LocalReductionAtomics, 4},
+    {Local4ReductionNoAtomics, 1},
+    {Local4ReductionNoAtomics, 2},
+    {Local4ReductionNoAtomics, 4},
+    /*    {Local4ReductionAtomics, 1},
+        {Local4ReductionAtomics, 2},
+        {Local4ReductionAtomics, 4},*/
+    {GlobalWGReduction, 1},
+    {GlobalWGReduction, 2},
+    {GlobalWGReduction, 4},
+    {GlobalAllToZeroReduction, 1},
+    {GlobalAllToZeroReduction, 2},
+    {GlobalAllToZeroReduction, 4},
+    {Global4WGReduction, 1},
+    {Global4WGReduction, 2},
+    {Global4WGReduction, 4},
+    {Global4AllToZeroReduction, 1},
+    {Global4AllToZeroReduction, 2},
+    {Global4AllToZeroReduction, 4},
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// OCLPerfAtomicSpeed implementation.
+///////////////////////////////////////////////////////////////////////////////
+OCLPerfAtomicSpeed::OCLPerfAtomicSpeed() {
+  _numSubTests =
+      sizeof(testOCLPerfAtomicSpeedList) / sizeof(testOCLPerfAtomicSpeedStruct);
+  _numLoops = 10;
+  _workgroupSize = 256;
+  _nCurrentInputScale = 1;
+  _maxMemoryAllocationSize = 0;
+  _dataSizeTooBig = false;
+  _atomicsSupported = false;
+  // Nothing is allocated or created yet.
+  _outputBuffer = NULL;
+  _inputBuffer = NULL;
+  _output = NULL;
+  _input = NULL;
+  _kernels.clear();
+  _programs.clear();
+}
+
+OCLPerfAtomicSpeed::~OCLPerfAtomicSpeed() {}  // empty; cleanup is in close()
+
+void OCLPerfAtomicSpeed::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  // Prepares sub-test `test` on device index `deviceId`: builds the host
+  // input and its CPU reference results, then creates the context, command
+  // queue, kernels and buffers. `units` is not used and `conversion` is
+  // always set to 1.0. When the data set is too large or neither int32
+  // base-atomics extension is reported, the function returns early and run()
+  // skips the test. Errors go through CHECK_RESULT (defined elsewhere).
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_int status = CL_SUCCESS;
+
+  device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  _cpuReductionSum = 0;
+  _dataSizeTooBig = false;
+  context_ = 0;
+  cmd_queue_ = 0;
+  _nCurrentInputScale = testOCLPerfAtomicSpeedList[_openTest].inputScale;
+  AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
+
+  // Allocate and fill the host input. setupHistogram() sets _dataSizeTooBig
+  // when that allocation fails; _input is NULL then and must not be read.
+  setupHistogram();
+  if (_dataSizeTooBig) return;
+  calculateHostBin();
+
+  // Use the platform at _platformIndex, provided it has devices of type_.
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy out the one id that is needed so the array can be released (with
+    // delete[], matching new[]) before any check that may leave the function.
+    cl_platform_id candidate = NULL;
+    if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
+      candidate = platforms[_platformIndex];
+    }
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    /* Get the number of requested devices */
+    num_devices = 0;
+    error_ = _wrapper->clGetDeviceIDs(candidate, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is not checked here.
+    // Choose platform with GPU devices
+    if (num_devices > 0) {
+      platform = candidate;
+    }
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed; free the list before the checks below so
+  // it is not leaked on any path.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  // One context holding only the selected device.
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, NULL, NULL, &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  // Extension string, searched below for the int32 base-atomics extensions.
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Largest single buffer the device can allocate.
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                     sizeof(cl_ulong),
+                                     &_maxMemoryAllocationSize, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS,
+               "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");
+
+  // Check that the test size is not too big for the current GPU, keeping
+  // 10 MB of headroom. Adding on the left (instead of subtracting from the
+  // limit) avoids unsigned wrap-around when the limit is below 10 MB.
+  cl_ulong tenMB = 1024 * 10240;
+  if ((_inputNBytes + tenMB) >= _maxMemoryAllocationSize) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  char *p = strstr(charbuf, "cl_khr_global_int32_base_atomics");
+  char *p2 = strstr(charbuf, "cl_khr_local_int32_base_atomics");
+
+  _atomicsSupported = false;
+  if (p || p2) _atomicsSupported = true;
+
+  // Verify atomics are supported.
+  if (!_atomicsSupported) return;
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Create buffers...
+  _inputBuffer =
+      clCreateBuffer(context_, CL_MEM_READ_ONLY, _inputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)");
+
+  // Create the programs/kernels for the current test type.
+  CreateKernels(atomicType);
+
+  // Default output: NBINS counters per work-group. Types for which
+  // IsReduction() is true get an output as large as the input instead.
+  _nThreadsPerGroup = _workgroupSize;
+  _nGroups = _nThreads / _nThreadsPerGroup;
+  _outputNBytes = _nGroups * NBINS * sizeof(cl_uint);
+  if (IsReduction(atomicType)) _outputNBytes = _inputNBytes;
+
+  _output = (cl_uint *)malloc(_outputNBytes);
+  if (0 == _output) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Create output Buffer
+  _outputBuffer =
+      clCreateBuffer(context_, CL_MEM_READ_WRITE, _outputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)");
+}
+
+// Creates and builds the program(s) for `atomicType`, then creates the
+// matching kernel(s); the results are appended to _programs and _kernels.
+void OCLPerfAtomicSpeed::CreateKernels(const AtomicType atomicType) {
+  char log[16384] = {0};  // zeroed: prints as "" if the log query fails
+  cl_kernel kernel_;
+  cl_program program_;
+  char buildOptions[1000];
+
+  SNPRINTF(buildOptions, sizeof(buildOptions),
+           "-D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS, BITS_PER_PIX,
+           NBANKS);
+
+  // Create the program(s): LocalHistogram needs two, every other type one.
+  switch (atomicType) {
+    case LocalHistogram:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_histogram, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_reduce, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case LocalReductionNoAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case Local4ReductionNoAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_vec4_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case LocalReductionAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case Local4ReductionAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_vec4_atomics_reduction, NULL,
+          &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_histogram, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
+  }
+  // Build every program with the same -D options.
+  for (size_t i = 0; i < _programs.size(); i++) {
+    error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
+                                      NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      _wrapper->clGetProgramBuildInfo(_programs[i], device,
+                                      CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                      NULL);
+      printf("Build error -> %s\n", log);
+      // The condition must be true for CHECK_RESULT to flag the failure.
+      CHECK_RESULT(true, "clBuildProgram failed");
+    }
+  }
+
+  switch (atomicType) {
+    case LocalHistogram:
+      kernel_ = _wrapper->clCreateKernel(_programs[0],
+                                         "local_atomics_histogram", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      kernel_ = _wrapper->clCreateKernel(_programs[1], "local_atomics_reduce",
+                                         &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case LocalReductionAtomics:
+    case Local4ReductionAtomics:
+      kernel_ =
+          _wrapper->clCreateKernel(_programs[0], "local_reduction", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+      kernel_ = _wrapper->clCreateKernel(_programs[0],
+                                         "global_atomics_histogram", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_workgroup", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
+  }
+}
+
+// Binds the buffers and scalars each kernel of `atomicType` expects, in order.
+void OCLPerfAtomicSpeed::SetKernelArguments(const AtomicType atomicType) {
+  int Arg = 0;
+  int localSize = 0;
+  int itemsPerThread = 1;
+  cl_int status = CL_SUCCESS;
+
+  switch (atomicType) {
+    case LocalHistogram:
+      // Set arguments for the local atomics histogram kernel (_kernels[0])
+      status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
+                                        (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
+                                         (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(_kernels[0], Arg++,
+                                         sizeof(_n4VectorsPerThread),
+                                         (void *)&_n4VectorsPerThread);
+      CHECK_RESULT(status, "clSetKernelArg failed. (n4VectorsPerThread)");
+
+      // Set arguments for the local atomics reduce kernel (_kernels[1])
+      Arg = 0;  // argument indices restart for the second kernel
+      status |= _wrapper->clSetKernelArg(_kernels[1], Arg++, sizeof(cl_mem),
+                                         (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(_kernels[1], Arg++, sizeof(_nGroups),
+                                         (void *)&_nGroups);
+      CHECK_RESULT(status, "clSetKernelArg failed. (nGroups)");
+      break;
+    case LocalReductionAtomics:
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case Local4ReductionAtomics:
+      status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
+                                        (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
+                                         (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+
+      localSize = DEFAULT_WG_SIZE * sizeof(cl_uint);
+      if ((Local4ReductionNoAtomics == atomicType) ||
+          (Local4ReductionAtomics == atomicType))
+        localSize *= 4;  // the vec4 variants get four times the size
+      status = _wrapper->clSetKernelArg(_kernels[0], Arg++, localSize, NULL);
+      CHECK_RESULT(status, "clSetKernelArg failed. (local memory)");
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+    case GlobalWGReduction:
+    case Global4WGReduction:
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      // Global histogram and global reduction kernels share one argument list
+      if ((Global4Histogram == atomicType) ||
+          (Global4WGReduction == atomicType) ||
+          (Global4AllToZeroReduction == atomicType))
+        itemsPerThread = 4;
+
+      status = _wrapper->clSetKernelArg(
+          _kernels[0], Arg++, sizeof(itemsPerThread), (void *)&itemsPerThread);
+      CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)");
+
+      status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
+                                        (void *)&_inputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+      status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
+                                         (void *)&_outputBuffer);
+      CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)");
+  }
+}
+
+// Zeroes the host copy of the output and uploads it to _outputBuffer. With
+// global atomics the output accumulates, so run() calls this before each pass.
+void OCLPerfAtomicSpeed::ResetGlobalOutput() {
+  cl_int status;
+
+  memset(_output, 0, _outputNBytes);
+
+  status =
+      _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0,
+                                     _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed.");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Runs the histogram kernel, then the reduce kernel (which waits on the
+// histogram's event), blocks until both are done and releases the events.
+void OCLPerfAtomicSpeed::RunLocalHistogram() {
+  cl_int status;
+  cl_event events[2] = {NULL, NULL};
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+  size_t globalThreadsReduce = NBINS;
+  size_t localThreadsReduce = _nThreadsPerGroup;
+  globalThreads[0] = _nThreads;
+  localThreads[0] = _nThreadsPerGroup;
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, &events[0]);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (histogram)");
+
+  status = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, _kernels[1], 1, NULL, &globalThreadsReduce,
+      &localThreadsReduce, 1, &events[0], &events[1]);
+  if (status != CL_SUCCESS) _wrapper->clReleaseEvent(events[0]);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduce)");
+  status = _wrapper->clFinish(cmd_queue_);
+  cl_int waitStatus = CL_SUCCESS;
+  if (status == CL_SUCCESS) waitStatus = _wrapper->clWaitForEvents(2, events);
+  _wrapper->clReleaseEvent(events[0]);
+  _wrapper->clReleaseEvent(events[1]);
+  CHECK_RESULT(status, "clFinish failed.");
+  CHECK_RESULT(waitStatus, "clWaitForEvents failed.");
+}
+
+// Enqueues the local reduction kernel for `atomicType` and waits for it.
+void OCLPerfAtomicSpeed::RunLocalReduction(const AtomicType atomicType) {
+  cl_int status;
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+  // One work-item per two input words; vec4 variants use a quarter of that.
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint) / 2;
+  localThreads[0] = _nThreadsPerGroup;
+  if ((Local4ReductionNoAtomics == atomicType) ||
+      (Local4ReductionAtomics == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduction)");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Enqueues the global-atomics kernel (histogram or reduction) and waits.
+void OCLPerfAtomicSpeed::RunGlobalHistogram(AtomicType atomicType) {
+  cl_int status;
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+  // One work-item per input word; the vec4 variants use a quarter of that.
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint);
+  localThreads[0] = _nThreadsPerGroup;
+
+  if ((Global4Histogram == atomicType) || (Global4WGReduction == atomicType) ||
+      (Global4AllToZeroReduction == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed.");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Uploads the input, runs one untimed warm-up plus _numLoops timed passes of
+// the selected kernel(s), reads the output back, reports and verifies it.
+void OCLPerfAtomicSpeed::run() {
+  cl_int status;
+  AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
+
+  // Skip when open() found no atomics support or the data set did not fit.
+  if ((!_atomicsSupported) || (_dataSizeTooBig)) return;
+
+  // Write data to the GPU
+  status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0,
+                                          _inputNBytes, _input, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)");
+
+  status = _wrapper->clFlush(cmd_queue_);
+  CHECK_RESULT(status, "clFlush failed.");
+
+  // Set the current arguments based on the test type.
+  SetKernelArguments(atomicType);
+
+  // Run the kernels.
+  CPerfCounter timer;
+  double totalTime = 0.0f;
+
+  for (unsigned int k = 0; k < _numLoops + 1; k++) {
+    // Since we run multiple times using global atomics the output
+    // would get accumulated therefore first clean it.
+    ResetGlobalOutput();
+
+    timer.Reset();
+    timer.Start();
+    switch (atomicType) {
+      case LocalHistogram:
+        RunLocalHistogram();
+        break;
+      case LocalReductionAtomics:
+      case LocalReductionNoAtomics:
+      case Local4ReductionNoAtomics:
+      case Local4ReductionAtomics:
+        RunLocalReduction(atomicType);
+        break;
+      case GlobalHistogram:
+      case Global4Histogram:
+      case GlobalWGReduction:
+      case Global4WGReduction:
+      case GlobalAllToZeroReduction:
+      case Global4AllToZeroReduction:
+        RunGlobalHistogram(atomicType);
+        break;
+      default:
+        CHECK_RESULT(true, "Atomic type not supported");
+    }
+    timer.Stop();
+    // Don't count the warm-up
+    if (0 != k) totalTime += timer.GetElapsedTime();
+  }
+
+  // Read the results back to the CPU - Only do it for the last run
+  // of the test instead of for each iteration of _numLoops.
+  status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0,
+                                         _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueReadBuffer failed.");
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+
+  // Print the results.
+  PrintResults(atomicType, totalTime);
+
+  // Check the results for the current test.
+  _errorFlag = !(VerifyResults(atomicType));
+}
+
+// Checks the device output against the CPU reference for `atomicType`.
+// Returns true on a match and false on a mismatch or an unsupported type.
+bool OCLPerfAtomicSpeed::VerifyResults(const AtomicType atomicType) {
+  bool matches = true;
+  switch (atomicType) {
+    case LocalHistogram:
+    case GlobalHistogram:
+    case Global4Histogram: {
+      // Every bin must equal the CPU histogram; stop at the first mismatch.
+      cl_uint bin = 0;
+      while ((bin < NBINS) && (_cpuhist[bin] == _output[bin])) ++bin;
+      matches = (bin == NBINS);
+      break;
+    }
+    case LocalReductionAtomics:
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case Local4ReductionAtomics:
+    case GlobalWGReduction:
+    case Global4WGReduction: {
+      // Add up the first (input words / _nThreadsPerGroup) outputs and
+      // compare the total with the CPU sum.
+      const cl_uint partials =
+          _inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
+      cl_uint total = 0;
+      for (cl_uint idx = 0; idx < partials; idx++) total += _output[idx];
+      matches = (total == _cpuReductionSum);
+      break;
+    }
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      // Only element 0 of the output is compared for these types.
+      matches = (_output[0] == _cpuReductionSum);
+      break;
+    default:
+      CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
+      return false;
+  }
+  if (!matches) printf("WRONG VALUES!!!!!");
+  return matches;
+}
+
+unsigned int OCLPerfAtomicSpeed::close() {
+  // Releases everything open() created and returns _crcword.
+  for (size_t i = 0; i < _kernels.size(); i++) {
+    error_ = _wrapper->clReleaseKernel(_kernels[i]);
+  }
+  for (size_t i = 0; i < _programs.size(); i++) {  // own index, from 0
+    error_ = _wrapper->clReleaseProgram(_programs[i]);
+  }
+  if (_inputBuffer) {
+    error_ = clReleaseMemObject(_inputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
+  }
+  if (_outputBuffer) {
+    error_ = clReleaseMemObject(_outputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
+  }
+
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // Free host memory.
+  free(_input);
+  free(_output);
+
+  // Reset everything.
+  _kernels.clear();
+  _programs.clear();
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+  cmd_queue_ = NULL;
+  context_ = NULL;
+  _input = NULL;
+  _output = NULL;
+
+  return _crcword;
+}
+
+/* Helper functions */
+void OCLPerfAtomicSpeed::calculateHostBin() {
+  // CPU reference results: a histogram of the value of every input byte and
+  // the sum of the low two bits of every input byte.
+  const cl_int *words = (cl_int *)_input;
+  memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
+  _cpuReductionSum = 0;
+
+  for (unsigned int w = 0; w < _inputNBytes / sizeof(cl_uint); w++) {
+    for (int shift = 24; shift >= 0; shift -= 8) {
+      const cl_int lane = words[w] >> shift;
+      _cpuhist[lane & 0xff]++;
+      _cpuReductionSum += lane & 0x3;
+    }
+  }
+}
+
+void OCLPerfAtomicSpeed::setupHistogram() {
+  // Sizes the problem for the current input scale, allocates the host input
+  // and fills it with pseudo-random words seeded from the current time.
+  _nThreads = 64 * 1024;
+#if defined(_WIN32) && !defined(_WIN64)
+  _n4Vectors = 1024 * 1024;
+#else
+  _n4Vectors = 2048 * 2048;
+#endif
+  _n4Vectors *= _nCurrentInputScale;
+  _n4VectorsPerThread = _n4Vectors / _nThreads;
+  _inputNBytes = _n4Vectors * sizeof(cl_uint4);
+
+  _input = (cl_uint *)malloc(_inputNBytes);
+  if (NULL == _input) {
+    _dataSizeTooBig = true;
+    return;
+  }
+  // Both generator words start from the same time-derived seed.
+  time_t now = time(NULL);
+  cl_uint mult = (cl_uint)now;
+  cl_uint state = (cl_uint)now;
+  cl_uint *words = (cl_uint *)_input;
+  for (unsigned int k = 0; k < _inputNBytes / sizeof(cl_uint); k++) {
+    state = (mult * (state & 65535)) + (state >> 16);
+    words[k] = state;
+  }
+}
+
+// Builds the description string for the finished test and stores its
+// throughput figure. `atomicType` selects the label; `totalTime` is the
+// summed time of the timed passes as accumulated by run().
+void OCLPerfAtomicSpeed::PrintResults(const AtomicType atomicType,
+                                      double totalTime) {
+  // Human-readable name of the test variant.
+  const char *label = NULL;
+  switch (atomicType) {
+    case LocalHistogram:
+      label = "Local histogram";
+      break;
+    case GlobalHistogram:
+      label = "Global histogram";
+      break;
+    case Global4Histogram:
+      label = "Global vec 4 histogram";
+      break;
+    case LocalReductionNoAtomics:
+      label = "Local reduction NO atomics";
+      break;
+    case Local4ReductionNoAtomics:
+      label = "Local vec 4 reduction NO atomics";
+      break;
+    case LocalReductionAtomics:
+      label = "Local reduction with atomics";
+      break;
+    case Local4ReductionAtomics:
+      label = "Local vec 4 reduction with atomics";
+      break;
+    case GlobalWGReduction:
+      label = "Global work-group reduction";
+      break;
+    case Global4WGReduction:
+      label = "Global vec 4 work-group reduction";
+      break;
+    case GlobalAllToZeroReduction:
+      label = "Global all to zero reduction";
+      break;
+    case Global4AllToZeroReduction:
+      label = "Global vec 4 all to zero reduction";
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (PrintResults)");
+  }
+
+  const double inputInGB = (double)_inputNBytes * (double)(1e-09);
+  // each cl_uint in _inputNBytes contributes 4 items.
+  const double totalHistogramDataInGB = (double)inputInGB * 4;
+  // Average time of one timed pass.
+  const double perf = totalTime / _numLoops;
+
+  // The label is right-aligned in a 45-character column.
+  char buf[500];
+  SNPRINTF(buf, sizeof(buf), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s",
+           label, totalHistogramDataInGB, perf);
+  // Data volume divided by the average pass time.
+  _perfInfo = (float)(totalHistogramDataInGB / perf);
+  testDescString = buf;
+}
+
+bool OCLPerfAtomicSpeed::IsReduction(const AtomicType atomicType) {
+  return ((atomicType >= LocalReductionNoAtomics) &&  // first reduction type
+          (atomicType <= Global4AllToZeroReduction));  // last, inclusive
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h
new file mode 100644
index 0000000000..1a94512866
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_AtomicSpeed_H_
+#define _OCL_AtomicSpeed_H_
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "OCLTestImp.h"
+
+#define DEFAULT_WG_SIZE 256
+#define NBINS 256
+#define BITS_PER_PIX 8
+#define NBANKS 16
+
+// Benchmark variants. IsReduction() range-checks these, so order matters.
+enum AtomicType {
+  LocalHistogram = 0,
+  GlobalHistogram,
+  Global4Histogram,
+  LocalReductionNoAtomics,    // reported as "Local reduction NO atomics"
+  Local4ReductionNoAtomics,   // "Local vec 4 reduction NO atomics"
+  LocalReductionAtomics,      // "Local reduction with atomics"
+  Local4ReductionAtomics,     // "Local vec 4 reduction with atomics"
+  GlobalWGReduction,          // "Global work-group reduction"
+  Global4WGReduction,         // "Global vec 4 work-group reduction"
+  GlobalAllToZeroReduction,   // "Global all to zero reduction"
+  Global4AllToZeroReduction,  // "Global vec 4 all to zero reduction"
+};
+
+typedef struct {
+  AtomicType atomicType;  // variant to run for this table entry
+  int inputScale;         // NOTE(review): presumably scales input size - confirm
+} testOCLPerfAtomicSpeedStruct;
+
+// Atomic-speed perf test; only this declaration is in view, notes are hedged.
+class OCLPerfAtomicSpeed : public OCLTestImp {
+ public:
+  OCLPerfAtomicSpeed();
+  virtual ~OCLPerfAtomicSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // NOTE(review): context_ may hide an OCLTestImp member of that name - confirm.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  std::vector<cl_program> _programs;
+  std::vector<cl_kernel> _kernels;
+  cl_device_id device;
+
+  bool _atomicsSupported;
+  bool _dataSizeTooBig;
+  cl_uint _numLoops;  // PrintResults() divides the total time by this count
+
+  // Histogram related state.
+ private:
+  cl_ulong _maxMemoryAllocationSize;
+  cl_uint _inputNBytes;  // PrintResults() derives the reported GB from this
+  cl_uint _outputNBytes;
+
+  cl_uint _nCurrentInputScale;
+  cl_uint _workgroupSize;
+  // (the loop count lives in the public _numLoops member)
+  cl_uint _nThreads;
+  cl_uint _nThreadsPerGroup;
+  cl_uint _nGroups;
+  cl_uint _n4Vectors;
+  cl_uint _n4VectorsPerThread;
+  cl_uint _nBins;
+  cl_uint _nBytesLDSPerGrp;
+
+  cl_uint* _input;
+  cl_uint* _output;
+  cl_mem _inputBuffer;
+  cl_mem _outputBuffer;
+
+  cl_uint _cpuhist[NBINS];  // NBINS counters
+  cl_uint _cpuReductionSum;
+
+  void calculateHostBin();
+  void setupHistogram();
+  bool VerifyResults(const AtomicType atomicType);
+  void ResetGlobalOutput();
+
+  // Methods that do the actual NDRange.
+  void RunLocalHistogram();
+  void RunLocalReduction(const AtomicType atomicType);
+  void RunGlobalHistogram(const AtomicType atomicType);
+
+  void CreateKernels(const AtomicType atomicType);
+  bool IsReduction(const AtomicType atomicType);  // enum range check
+  void SetKernelArguments(const AtomicType atomicType);
+  void PrintResults(const AtomicType atomicType, double totalTime);
+};
+
+#endif  // _OCL_AtomicSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp
new file mode 100644
index 0000000000..cf7716dfe8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp
@@ -0,0 +1,509 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfAtomicSpeed20.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "CL/cl.h"
+#include "OCLPerfAtomicSpeed20Kernels.h"
+#include "Timer.h"
+
+// Bounded formatting helper: sprintf_s when WIN_OS is defined, else snprintf.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Sub-test table: open() reads atomicType and inputScale from entry `test`.
+testOCLPerfAtomicSpeed20Struct testOCLPerfAtomicSpeed20List[] = {
+    {GlobalWGReduction, 1},         {GlobalWGReduction, 2},
+    {GlobalWGReduction, 4},         {GlobalAllToZeroReduction, 1},
+    {GlobalAllToZeroReduction, 2},  {GlobalAllToZeroReduction, 4},
+    {Global4WGReduction, 1},        {Global4WGReduction, 2},
+    {Global4WGReduction, 4},        {Global4AllToZeroReduction, 1},
+    {Global4AllToZeroReduction, 2}, {Global4AllToZeroReduction, 4},
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// OCLPerfAtomicSpeed20 implementation.
+///////////////////////////////////////////////////////////////////////////////
+OCLPerfAtomicSpeed20::OCLPerfAtomicSpeed20() {
+  // One sub-test per entry of the test table.
+  _numSubTests = sizeof(testOCLPerfAtomicSpeed20List) /
+                 sizeof(testOCLPerfAtomicSpeed20List[0]);
+  _numLoops = 10;
+  _workgroupSize = 256;
+  _nCurrentInputScale = 1;
+  _maxMemoryAllocationSize = 0;
+
+  // Status flags start cleared; open() updates them.
+  _atomicsSupported = false;
+  _dataSizeTooBig = false;
+  skip_ = false;
+
+  // No host or device resources exist until open() runs.
+  _input = _output = NULL;
+  _inputBuffer = _outputBuffer = NULL;
+  _kernels.clear();
+  _programs.clear();
+}
+
+OCLPerfAtomicSpeed20::~OCLPerfAtomicSpeed20() {}  // cleanup is done by close()
+
+void OCLPerfAtomicSpeed20::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  // Prepares sub-test `test`: host input, device buffers and the kernel. When
+  // the device cannot run it, only _dataSizeTooBig/_atomicsSupported are set.
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+
+#if defined(CL_VERSION_2_0)
+  cl_int status = CL_SUCCESS;
+  conversion = 1.0f;
+  _openTest = test;
+  _cpuReductionSum = 0;
+  _dataSizeTooBig = false;
+  _atomicsSupported = false;
+  _nCurrentInputScale = testOCLPerfAtomicSpeed20List[_openTest].inputScale;
+  AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType;
+
+  // setupHistogram() reports a failed host allocation via _dataSizeTooBig;
+  // stop before calculateHostBin() would read through a NULL _input.
+  setupHistogram();
+  if (_dataSizeTooBig) return;
+  calculateHostBin();
+
+  // Use the queue already stored in cmdQueues_; one created here would leak.
+  cl_device_id device = devices_[_deviceId];
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  // Ask for the extension string's size first; a fixed buffer can be too small.
+  size_t extSize = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL,
+                                     &extSize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  std::vector<char> extensions(extSize + 1, '\0');
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, extSize,
+                                     &extensions[0], NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Global memory size
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                     sizeof(cl_ulong),
+                                     &_maxMemoryAllocationSize, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS,
+               "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");
+
+  // Keep 10 MB of headroom; adding on the left cannot wrap for small limits.
+  const cl_ulong tenMB = 1024 * 10240;
+  if ((cl_ulong)_inputNBytes + tenMB >= _maxMemoryAllocationSize) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Nothing more to set up unless 32-bit global atomics are advertised.
+  _atomicsSupported =
+      (strstr(&extensions[0], "cl_khr_global_int32_base_atomics") != NULL);
+  if (!_atomicsSupported) return;
+
+  // Create buffers...
+  _inputBuffer =
+      clCreateBuffer(context_, CL_MEM_READ_ONLY, _inputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)");
+
+  // Create the programs/kernels for the current test type.
+  CreateKernels(atomicType);
+
+  _nThreadsPerGroup = _workgroupSize;
+  _nGroups = _nThreads / _nThreadsPerGroup;
+  _outputNBytes = _inputNBytes;
+
+  _output = (cl_uint *)malloc(_outputNBytes);
+  if (0 == _output) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Create output Buffer
+  _outputBuffer =
+      clCreateBuffer(context_, CL_MEM_READ_WRITE, _outputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)");
+#else
+  skip_ = true;
+  testDescString = "OpenCL verion < 2.0. Test Skipped.";
+  return;
+#endif
+}
+
+// Builds the OpenCL 2.0 program for `atomicType` and creates its kernel.
+// Handles are appended to _programs/_kernels; errors go through CHECK_RESULT.
+void OCLPerfAtomicSpeed20::CreateKernels(const AtomicType atomicType) {
+  char log[16384] = {0};  // stays an empty string if the log query fails
+  cl_kernel kernel;       // named so the kernel_/program_ members (set in
+  cl_program program;     // open()) are no longer shadowed
+  char buildOptions[1000];
+  cl_device_id device = devices_[_deviceId];
+
+  SNPRINTF(buildOptions, sizeof(buildOptions),
+           "-cl-std=CL2.0 -D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS,
+           BITS_PER_PIX, NBANKS);
+
+  // Create the programs.
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      program = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
+          NULL, &error_);
+      CHECK_RESULT(program == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      program = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
+          NULL, &error_);
+      CHECK_RESULT(program == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
+  }
+  // Build the programs.
+  for (size_t i = 0; i < _programs.size(); i++) {
+    error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
+                                      NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      _wrapper->clGetProgramBuildInfo(_programs[i], device,
+                                      CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                      NULL);
+      printf("Build error -> %s\n", log);
+      // CHECK_RESULT fails on a true condition; the former literal 0 never did.
+      CHECK_RESULT(true, "clBuildProgram failed");
+    }
+  }
+
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      kernel = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_workgroup", &error_);
+      CHECK_RESULT(kernel == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      kernel = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
+      CHECK_RESULT(kernel == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
+  }
+}
+
+// Binds itemsPerThread plus the input/output buffers to _kernels[0].
+void OCLPerfAtomicSpeed20::SetKernelArguments(const AtomicType atomicType) {
+  const bool vec4 = (Global4WGReduction == atomicType) ||
+                    (Global4AllToZeroReduction == atomicType);
+  const bool scalar = (GlobalWGReduction == atomicType) ||
+                      (GlobalAllToZeroReduction == atomicType);
+  if (!(vec4 || scalar)) {
+    CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)");
+    return;
+  }
+
+  // Vec-4 variants pass 4 as the kernel's ItemsPerThread loop count.
+  int itemsPerThread = vec4 ? 4 : 1;
+  cl_kernel reductionKernel = _kernels[0];
+  cl_uint argIndex = 0;
+  cl_int status = CL_SUCCESS;
+
+  // Argument 0: number of input words each work-item consumes.
+  status = _wrapper->clSetKernelArg(reductionKernel, argIndex++,
+                                    sizeof(itemsPerThread),
+                                    (void *)&itemsPerThread);
+  CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)");
+
+  // Argument 1: input data.
+  status = _wrapper->clSetKernelArg(reductionKernel, argIndex++, sizeof(cl_mem),
+                                    (void *)&_inputBuffer);
+  CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
+
+  // Argument 2: accumulator(s) updated atomically by the kernel.
+  status |= _wrapper->clSetKernelArg(reductionKernel, argIndex++,
+                                     sizeof(cl_mem), (void *)&_outputBuffer);
+  CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
+}
+
+// Zeroes the host output and uploads it so every run starts from a clean
+// accumulator (the kernels add into the output buffer atomically).
+void OCLPerfAtomicSpeed20::ResetGlobalOutput() {
+  cl_int status;
+
+  memset(_output, 0, _outputNBytes);
+
+  status =
+      _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0,
+                                     _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed.");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");  // message names the real call
+}
+
+// Launches _kernels[0] over the input as a 1-D range and waits for it.
+void OCLPerfAtomicSpeed20::RunGlobalHistogram(AtomicType atomicType) {
+  cl_int status;  // OpenCL status codes are signed (errors are negative)
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+  // One work-item per input word, grouped by the configured work-group size.
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint);
+  localThreads[0] = _nThreadsPerGroup;
+  // Vec-4 variants consume four words per work-item, so launch a quarter.
+  if ((Global4WGReduction == atomicType) ||
+      (Global4AllToZeroReduction == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed.");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Uploads the input, times _numLoops kernel runs (after one untimed warm-up),
+// then reads the result back, reports it and verifies it.
+void OCLPerfAtomicSpeed20::run() {
+  if (skip_) {
+    return;
+  }
+
+#if defined(CL_VERSION_2_0)
+  cl_int status;  // signed, like every OpenCL status code
+  AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType;
+
+  // Verify atomics are supported.
+  if ((!_atomicsSupported) || (_dataSizeTooBig)) return;
+
+  // Write data to the GPU
+  status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0,
+                                          _inputNBytes, _input, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)");
+
+  status = _wrapper->clFlush(cmd_queue_);
+  CHECK_RESULT(status, "clFlush failed.");
+
+  // Set the current arguments based on the test type.
+  SetKernelArguments(atomicType);
+
+  // Run the kernels.
+  CPerfCounter timer;
+  double totalTime = 0.0f;
+
+  for (unsigned int k = 0; k < _numLoops + 1; k++) {
+    // Since we run multiple times using global atomics the output
+    // would get accumulated therefore first clean it.
+    ResetGlobalOutput();
+
+    timer.Reset();
+    timer.Start();
+    switch (atomicType) {
+      case GlobalWGReduction:
+      case Global4WGReduction:
+      case GlobalAllToZeroReduction:
+      case Global4AllToZeroReduction:
+        RunGlobalHistogram(atomicType);
+        break;
+      default:
+        CHECK_RESULT(true, "Atomic type not supported");
+    }
+    timer.Stop();
+    // Don't count the warm-up
+    if (0 != k) totalTime += timer.GetElapsedTime();
+  }
+
+  status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0,
+                                         _outputNBytes, _output, 0, NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueReadBuffer failed.");
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+
+  // Print the results.
+  PrintResults(atomicType, totalTime);
+
+  // Check the results for the current test.
+  _errorFlag = !(VerifyResults(atomicType));
+#endif
+}
+
+// Returns true when the device result equals the CPU reference sum.
+bool OCLPerfAtomicSpeed20::VerifyResults(const AtomicType atomicType) {
+  bool matches = true;
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction: {
+      // Work-group variants leave partial sums in the leading output slots;
+      // add them up on the host before comparing.
+      const cl_uint slotCount =
+          _inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
+      cl_uint deviceSum = 0;
+      for (cl_uint s = 0; s < slotCount; ++s) deviceSum += _output[s];
+      matches = (_cpuReductionSum == deviceSum);
+      break;
+    }
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      // All-to-zero variants accumulate everything into element 0.
+      matches = (_cpuReductionSum == _output[0]);
+      break;
+    default:
+      CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
+      return false;
+  }
+  if (!matches) printf("WRONG VALUES!!!!!");
+  return matches;
+}
+
+unsigned int OCLPerfAtomicSpeed20::close() {
+  // Releases kernels, programs, buffers and host memory, then the base class.
+  for (size_t i = 0; i < _kernels.size(); i++) {
+    error_ = _wrapper->clReleaseKernel(_kernels[i]);
+  }
+  for (size_t i = 0; i < _programs.size(); i++) {  // own index: was skipped
+    error_ = _wrapper->clReleaseProgram(_programs[i]);
+  }
+
+  if (_inputBuffer) {
+    error_ = clReleaseMemObject(_inputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
+  }
+  if (_outputBuffer) {
+    error_ = clReleaseMemObject(_outputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
+  }
+
+  // Free host memory.
+  free(_input);
+  free(_output);
+
+  // Reset everything.
+  _kernels.clear();
+  _programs.clear();
+
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+
+  _input = NULL;
+  _output = NULL;
+
+  return OCLTestImp::close();
+}
+
+/* Helper functions */
+void OCLPerfAtomicSpeed20::calculateHostBin() {
+  // CPU reference: byte histogram plus the sum of each byte's low two bits.
+  memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
+  _cpuReductionSum = 0;
+  if (NULL == _input) return;  // setupHistogram() could not allocate the input
+
+  const size_t nWords = _inputNBytes / sizeof(cl_uint);
+  for (size_t i = 0; i < nWords; i++) {
+    // Walk the four bytes of the word; unsigned shifts are fully defined.
+    for (cl_uint word = _input[i], b = 0; b < 4; b++, word >>= 8) {
+      _cpuhist[word & 0xff]++;
+      _cpuReductionSum += word & 0x3;
+    }
+  }
+}
+
+void OCLPerfAtomicSpeed20::setupHistogram() {
+  // Size the problem: 2048*2048 uint4 vectors per unit of input scale;
+  // _nThreads is fixed at 64K.
+  _nThreads = 64 * 1024;
+  _n4Vectors = 2048 * 2048 * _nCurrentInputScale;
+  _n4VectorsPerThread = _n4Vectors / _nThreads;
+  _inputNBytes = _n4Vectors * sizeof(cl_uint4);
+
+  _input = (cl_uint *)malloc(_inputNBytes);
+  if (NULL == _input) {
+    _dataSizeTooBig = true;  // allocation failure is reported via this flag
+    return;
+  }
+
+  // Fill the buffer with a time-seeded pseudo-random word sequence.
+  const cl_uint seed = (cl_uint)time(NULL);
+  const cl_uint factor = seed;
+  cl_uint state = seed;
+  const size_t wordCount = _inputNBytes / sizeof(cl_uint);
+  for (size_t w = 0; w < wordCount; ++w) {
+    state = (factor * (state & 65535)) + (state >> 16);
+    _input[w] = state;
+  }
+}
+
+// Builds the result line in testDescString and stores the GB/s in _perfInfo.
+// totalTime is the summed duration of the _numLoops timed iterations.
+void OCLPerfAtomicSpeed20::PrintResults(const AtomicType atomicType,
+                                        double totalTime) {
+  // Human-readable name of the variant, right-aligned in the output below.
+  const char *variantName = NULL;
+  switch (atomicType) {
+    case GlobalWGReduction:
+      variantName = "Global work-group reduction";
+      break;
+    case Global4WGReduction:
+      variantName = "Global vec 4 work-group reduction";
+      break;
+    case GlobalAllToZeroReduction:
+      variantName = "Global all to zero reduction";
+      break;
+    case Global4AllToZeroReduction:
+      variantName = "Global vec 4 all to zero reduction";
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (PrintResults)");
+      return;
+  }
+
+  // Each cl_uint of input contributes 4 items, hence the factor of four.
+  const double processedGB = (double)_inputNBytes * (double)(1e-09) * 4;
+  // Average seconds per timed loop.
+  const double secondsPerLoop = totalTime / _numLoops;
+
+  char line[500];
+  SNPRINTF(line, sizeof(line), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s",
+           variantName, processedGB, secondsPerLoop);
+  _perfInfo = (float)(processedGB / secondsPerLoop);
+  testDescString = line;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h
new file mode 100644
index 0000000000..b3c39da048
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_AtomicSpeed20_H_
+#define _OCL_AtomicSpeed20_H_
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "OCLTestImp.h"
+
+#define DEFAULT_WG_SIZE 256
+#define NBINS 256
+#define BITS_PER_PIX 8
+#define NBANKS 16
+
+#include "OCLPerfAtomicSpeed.h"
+
+typedef struct {
+  AtomicType atomicType;  // kernel variant selected for the sub-test
+  int inputScale;         // multiplies the base input size in setupHistogram()
+} testOCLPerfAtomicSpeed20Struct;
+
+// Perf test timing global-atomic sum reductions written with CL 2.0 atomics.
+class OCLPerfAtomicSpeed20 : public OCLTestImp {
+ public:
+  OCLPerfAtomicSpeed20();
+  virtual ~OCLPerfAtomicSpeed20();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // set up sub-test `test`
+  virtual void run(void);                    // upload, time, read back, verify
+  virtual unsigned int close(void);          // release what open() set up
+
+  cl_command_queue cmd_queue_;  // queue used for every enqueue in this test
+  std::vector<cl_program> _programs;
+  std::vector<cl_kernel> _kernels;  // the launches use _kernels[0]
+
+  bool _atomicsSupported;  // device lists cl_khr_global_int32_base_atomics
+  bool _dataSizeTooBig;    // input exceeds device/host limits; run() bails out
+  cl_uint _numLoops;       // timed iterations (one extra warm-up is untimed)
+
+  // Histogram related state.
+ private:
+  cl_ulong _maxMemoryAllocationSize;  // CL_DEVICE_MAX_MEM_ALLOC_SIZE
+  cl_uint _inputNBytes;
+  cl_uint _outputNBytes;  // open() sets this equal to _inputNBytes
+
+  cl_uint _nCurrentInputScale;  // inputScale of the open sub-test
+  cl_uint _workgroupSize;
+  // (the loop count lives in the public _numLoops member)
+  cl_uint _nThreads;
+  cl_uint _nThreadsPerGroup;  // local work size of the launch
+  cl_uint _nGroups;
+  cl_uint _n4Vectors;  // number of cl_uint4 vectors in the input
+  cl_uint _n4VectorsPerThread;
+  cl_uint _nBins;            // not referenced by OCLPerfAtomicSpeed20.cpp
+  cl_uint _nBytesLDSPerGrp;  // not referenced by OCLPerfAtomicSpeed20.cpp
+
+  cl_uint* _input;   // host input, malloc'ed by setupHistogram()
+  cl_uint* _output;  // host copy of the output, malloc'ed by open()
+  cl_mem _inputBuffer;
+  cl_mem _outputBuffer;
+  bool skip_;  // set by open() when built without CL_VERSION_2_0
+
+  cl_uint _cpuhist[NBINS];    // CPU byte histogram from calculateHostBin()
+  cl_uint _cpuReductionSum;   // CPU reference value checked by VerifyResults()
+
+  void calculateHostBin();
+  void setupHistogram();
+  bool VerifyResults(const AtomicType atomicType);
+  void ResetGlobalOutput();
+
+  // Method that does the actual NDRange.
+  void RunGlobalHistogram(const AtomicType atomicType);
+
+  void CreateKernels(const AtomicType atomicType);
+  void SetKernelArguments(const AtomicType atomicType);
+  void PrintResults(const AtomicType atomicType, double totalTime);
+};
+
+#endif  // _OCL_AtomicSpeed20_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h
new file mode 100644
index 0000000000..e3697c4f9a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+static const char *global_atomics_sum_reduction_all_to_zero =  // OpenCL C
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_all_to_zero(uint "
+    "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"                 // per-work-item partial sum
+    "    const uint msk =  (uint)3;\n"    // keeps the low two bits of a byte
+    "    const uint shft = (uint)8;\n"    // moves on to the next byte
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"  // no "\n": shares a source line with the next
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"  // next word is one global size further on
+    "    }\n"
+    "    atomic_fetch_add_explicit( &(Output[0]), sum, memory_order_relaxed, "
+    "memory_scope_device);\n"  // every work-item adds into Output[0]
+    "}\n";
+
+static const char *global_atomics_sum_reduction_workgroup =  // OpenCL C
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_workgroup(uint "
+    "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"                 // per-work-item partial sum
+    "    const uint msk =  (uint)3;\n"    // keeps the low two bits of a byte
+    "    const uint shft = (uint)8;\n"    // moves on to the next byte
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"  // no "\n": shares a source line with the next
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"  // next word is one global size further on
+    "    }\n"
+    "    atomic_fetch_add_explicit( &(Output[get_group_id(0)]), sum, "
+    "memory_order_relaxed, memory_scope_device);\n"  // one slot per work-group
+    "}\n";
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h
new file mode 100644
index 0000000000..defbff4e8f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h
@@ -0,0 +1,402 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+static const char *local_atomics_histogram =  // kernel source: per-work-group histogram built in __local memory
+    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+    "#define MIN(a,b) (((a) < (b)) ? (a) : (b))\n"  // whole expansion parenthesized so it is safe inside larger expressions
+    "#define MAX(a,b) (((a) > (b)) ? (a) : (b))\n"
+    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
+    "void local_atomics_histogram(__global uint4 *Image,\n"
+    "__global uint  *Histogram,\n"
+    "uint  n4VectorsPerThread)\n"
+    "{\n"
+    "    __local __attribute__((aligned(16))) uint subhists[NBANKS * NBINS];\n"  // NBANKS, NBINS, BITS_PER_PIX are not defined in this string; presumably -D build options (confirm in host code)
+    "\n"
+    "    uint tid     = get_global_id(0);\n"
+    "    uint ltid    = get_local_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "\n"
+    "    uint i, idx;\n"
+    "    uint4 temp, temp2;\n"
+    "    const uint shft = (uint) BITS_PER_PIX;\n"
+    "    const uint msk =  (uint) (NBINS-1);\n"
+    "    uint offset = (uint) ltid % (uint) (NBANKS);\n"  // bank used by this work-item
+    "\n"
+    "    uint lmem_items = NBANKS * NBINS;\n"
+    "    uint lmem_items_per_thread;\n"
+    "    uint lmem_max_threads;\n"
+    "\n"
+    "    // parallel LDS clear\n"
+    "    // first, calculate threads per item, at least 1:\n"
+    "    lmem_max_threads = MIN( 1, get_local_size(0) / lmem_items );\n"  // NOTE(review): MIN(1, x) is at most 1, not "at least 1" as the kernel comment says
+    "    // but no more than we have items:\n"
+    "    lmem_max_threads = MAX( 1, lmem_max_threads / lmem_items );\n"  // never below 1, so the division two lines down is safe
+    "    // calculate threads total:\n"
+    "    lmem_max_threads = lmem_items / lmem_max_threads;\n"
+    "    // but no more than LDS banks:\n"
+    "    lmem_max_threads = MIN( get_local_size(0), lmem_max_threads );\n"  // clamps to the work-group size
+    "\n"
+    "    lmem_items_per_thread = lmem_items / lmem_max_threads;\n"
+    "\n"
+    "    // now, clear LDS\n"
+    "    __local uint4 *p = (__local uint4 *) subhists;\n"  // clears four counters per store
+    "\n"
+    "    if( ltid < lmem_max_threads )\n"
+    "    {\n"
+    "        for(i=0, idx=ltid; i<lmem_items_per_thread/4; i++, "
+    "idx+=lmem_max_threads)\n"
+    "        {\n"
+    "            p[idx] = 0;\n"
+    "        }\n"
+    "    }\n"
+    "\n"
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "\n"
+    "    // read & scatter phase\n"
+    "\n"
+    "    for( i=0, idx=tid; i<n4VectorsPerThread; i++, idx += Stride )\n"
+    "    {\n"
+    "        temp = Image[idx];\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"  // counter index = bin * NBANKS + this work-item's bank
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "\n"
+    "        temp = temp >> shft;\n"  // next BITS_PER_PIX-wide field of every component
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "\n"
+    "        temp = temp >> shft;\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "\n"
+    "        temp = temp >> shft;\n"
+    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
+    "\n"
+    "        (void) atom_inc( subhists + temp2.x );\n"
+    "        (void) atom_inc( subhists + temp2.y );\n"
+    "        (void) atom_inc( subhists + temp2.z );\n"
+    "        (void) atom_inc( subhists + temp2.w );\n"
+    "    }\n"
+    "\n"
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "\n"
+    "    // reduce __local banks to single histogram per work-group\n"
+    "\n"
+    "    if( ltid < NBINS )\n"
+    "    {\n"
+    "        uint bin = 0;\n"
+    "        for( i=0; i<NBANKS; i++ )\n"
+    "        {\n"
+    "            bin += subhists[ (ltid * NBANKS) + i ];\n"
+    "        }\n"
+    "        Histogram[ (get_group_id(0) * NBINS) + ltid ] = bin;\n"  // each work-group writes its own NBINS-entry slice
+    "    }\n"
+    "}\n";
+
+static const char *local_atomics_reduce =  // kernel source: folds the per-group histograms into the first one
+    " __kernel void local_atomics_reduce( __global uint *Histogram, uint "
+    "nSubHists )\n"
+    "{\n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint bin = 0;\n"
+    "    // Reduce work-group histograms into single histogram,\n"
+    "    // one thread for each bin.\n"
+    "    for( int i=0; i < nSubHists; i++ )\n"
+    "        bin += Histogram[ (i * NBINS) + tid ];\n"  // NBINS is not defined in this string; presumably a -D build option
+    "    Histogram[ tid ] = bin;\n"  // result overwrites sub-histogram 0 in place
+    "}\n";
+
+static const char *global_atomics_histogram =  // kernel source: histogram built with atomics directly on global memory
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
+    "void global_atomics_histogram(uint ItemsPerThread,\n"
+    "__global uint *Input,\n"
+    "__global uint  *Histogram)\n"
+    "{\n"
+    "   uint tid = get_global_id(0);\n"
+    "   const uint shft = (uint) BITS_PER_PIX;\n"  // BITS_PER_PIX and NBINS are not defined in this string; presumably -D build options
+    "   const uint msk =  (uint) (NBINS-1);\n"
+    "   uint Stride  = get_global_size(0);\n"
+    "   for( int i = 0; i < ItemsPerThread; i++)\n"
+    "   {\n"
+    "       uint temp  = Input[tid];\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"  // four increments per input word, one per shifted field
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp & msk) ]) );\n"
+    "       tid += Stride;"  // no "\n": the closing brace follows on the same kernel source line
+    "   }\n"
+    "}\n";
+
+static const char *global_vec4_atomics_histogram =  // kernel source: uint4-input variant of the global-atomics histogram
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
+    "void global_atomics_histogram(uint ItemsPerThread,\n"  // entry point has the same name as in the scalar source string
+    "__global uint4 *Input,\n"
+    "__global uint  *Histogram)\n"
+    "{\n"
+    "   uint tid = get_global_id(0);\n"
+    "   const uint shft = (uint) BITS_PER_PIX;\n"  // BITS_PER_PIX and NBINS are not defined in this string; presumably -D build options
+    "   const uint msk =  (uint) (NBINS-1);\n"
+    "   uint Stride  = get_global_size(0);\n"
+    "   for( int i = 0; i < ItemsPerThread; i++)\n"
+    "   {\n"
+    "       uint4 temp  = Input[tid];\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"  // sixteen increments per uint4: 4 components x 4 shifted fields
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       temp = temp >> shft;\n"
+    "       atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
+    "       atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
+    "       tid += Stride;"  // no "\n": the closing brace follows on the same kernel source line
+    "   }\n"
+    "}\n";
+
+static const char *global_atomics_sum_reduction_all_to_zero =  // kernel source: every work-item adds into Output[0]
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_all_to_zero(uint "
+    "ItemsPerThread, __global uint *Input, __global int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"
+    "    const uint msk =  (uint)3;\n"  // keeps the low 2 bits of the current byte
+    "    const uint shft = (uint)8;\n"  // moves the next byte of the uint into place
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"  // no "\n": shares a kernel source line with the next statement
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"
+    "    }\n"
+    "    atom_add( &(Output[0]), sum);\n"  // a single global counter shared by all work-items
+    "}\n";
+
+static const char *global_atomics_sum_reduction_workgroup =  // kernel source: one atomic sum slot per work-group
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    " __kernel void global_atomics_sum_reduction_workgroup(uint "
+    "ItemsPerThread, __global uint *Input, __global int *Output )\n"
+    "{\n"
+    "    uint sum = 0;\n"
+    "    const uint msk =  (uint)3;\n"  // keeps the low 2 bits of the current byte
+    "    const uint shft = (uint)8;\n"  // moves the next byte of the uint into place
+    "    \n"
+    "    uint tid = get_global_id(0);\n"
+    "    uint Stride  = get_global_size(0);\n"
+    "    for( int i = 0; i < ItemsPerThread; i++)\n"
+    "    {\n"
+    "       uint data = Input[tid];\n"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"  // no "\n": shares a kernel source line with the next statement
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       data = data >> shft;"
+    "       sum += data & msk;\n"
+    "       tid += Stride;\n"
+    "    }\n"
+    "    atom_add( &(Output[get_group_id(0)]), sum);\n"  // contention is limited to the work-items of one group
+    "}\n";
+
+static const char *local_reduction =  // kernel source: non-atomic sum reduction through __local memory
+    "__kernel void local_reduction(__global uint* input, __global uint* "
+    "output, __local uint* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"  // each work-item consumes two adjacent input elements
+    "   unsigned int data1 = input[stride];\n"
+    "   unsigned int data2 = input[stride + 1];\n"
+    "   unsigned int sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"  // low 2 bits of each of the 4 bytes
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"  // NOTE(review): the halving covers every slot only for power-of-two local sizes
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           sdata[tid] += sdata[tid + s];\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
+
+static const char *local_vec4_reduction =  // kernel source: uint4 variant of the non-atomic __local reduction
+    "__kernel void local_reduction(__global uint4* input, __global uint4* "  // entry point has the same name as in the scalar source string
+    "output, __local uint4* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"  // each work-item consumes two adjacent uint4 elements
+    "   uint4 data1 = input[stride];\n"
+    "   uint4 data2 = input[stride + 1];\n"
+    "   uint4 sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"  // component-wise: the four lanes are summed independently
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"  // NOTE(review): the halving covers every slot only for power-of-two local sizes
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           sdata[tid] += sdata[tid + s];\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
+
+static const char *local_atomics_reduction =  // kernel source: __local reduction whose adds go through atom_add
+    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+    "__kernel void local_reduction(__global uint* input, __global uint* "  // entry point has the same name as in the non-atomic source string
+    "output, __local uint* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"  // each work-item consumes two adjacent input elements
+    "   unsigned int data1 = input[stride];\n"
+    "   unsigned int data2 = input[stride + 1];\n"
+    "   unsigned int sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"  // NOTE(review): the halving covers every slot only for power-of-two local sizes
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           atom_add( &(sdata[tid]), sdata[tid + s]);\n"  // only difference from local_reduction: the add is atomic
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
+
+static const char *local_vec4_atomics_reduction =  // kernel source: uint4 __local reduction with one atom_add per component
+    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+    "__kernel void local_reduction(__global uint4* input, __global uint4* "  // entry point has the same name as in the other reduction source strings
+    "output, __local uint4* sdata)\n"
+    "{\n"
+    "   // load shared mem\n"
+    "   const uint msk =  (uint)3;\n"
+    "   const uint shft = (uint)8;\n"
+    "   unsigned int tid = get_local_id(0);\n"
+    "\n"
+    "   unsigned int localSize = get_local_size(0);\n"
+    "   unsigned int stride = get_global_id(0) * 2;\n"  // each work-item consumes two adjacent uint4 elements
+    "   uint4 data1 = input[stride];\n"
+    "   uint4 data2 = input[stride + 1];\n"
+    "   uint4 sum = 0;\n"
+    "   for( int i = 0; i < 4; i++)\n"
+    "   {\n"
+    "       sum += (data1 & msk) + (data2 & msk);\n"
+    "       data1 = data1 >> shft;\n"
+    "       data2 = data2 >> shft;\n"
+    "   }\n"
+    "   sdata[tid] = sum;"
+    "\n"
+    "   barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   // do reduction in shared mem\n"
+    "   for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"  // NOTE(review): the halving covers every slot only for power-of-two local sizes
+    "   {\n"
+    "       if(tid < s) \n"
+    "       {\n"
+    "           atom_add( &(sdata[tid]).x, sdata[tid + s].x);\n"  // NOTE(review): takes the address of a vector component; confirm the target compiler accepts it
+    "           atom_add( &(sdata[tid]).y, sdata[tid + s].y);\n"
+    "           atom_add( &(sdata[tid]).z, sdata[tid + s].z);\n"
+    "           atom_add( &(sdata[tid]).w, sdata[tid + s].w);\n"
+    "       }\n"
+    "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "   }\n"
+    "\n"
+    "   // write result for this block to global mem\n"
+    "   if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
+    "}\n";
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp
new file mode 100644
index 0000000000..0cfb9de532
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp
@@ -0,0 +1,254 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferCopyOverhead.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: use sprintf_s on WIN_OS builds, snprintf elsewhere.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+typedef struct {  // one row of testList
+  unsigned int iterations;  // number of timed copies that run() enqueues
+  int flushEvery;  // run() waits after every N copies; <= 0 waits only after the last one
+} testStruct;
+
+static testStruct testList[] = {  // {iterations, flushEvery}
+    {1, -1},         {1, -1},      {10, 1},      {10, -1},      {100, 1},
+    {100, 10},       {100, -1},    {1000, 1},    {1000, 10},    {1000, 100},
+    {1000, -1},      {10000, 1},   {10000, 10},  {10000, 100},  {10000, 1000},
+    {10000, -1},     {100000, 1},  {100000, 10}, {100000, 100}, {100000, 1000},
+    {100000, 10000}, {100000, -1},
+};  // NOTE(review): the first two rows are both {1, -1}; confirm the duplicate is intended
+
+OCLPerfBufferCopyOverhead::OCLPerfBufferCopyOverhead() {
+  _numSubTests = 2 * 2 * sizeof(testList) / sizeof(testStruct);  // each row runs 4x: open() derives sleep/spin and srcHost from the sub-test index
+}
+
+OCLPerfBufferCopyOverhead::~OCLPerfBufferCopyOverhead() {}  // empty: the CL objects are released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,  // passed to clCreateContext in open()
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op: reported context errors are discarded
+
+// Prepares one sub-test: resolves the platform and device, creates the context
+// and queue, then allocates the two bufSize_-byte buffers. `test` picks the
+// testList row, the wait mode (sleep vs. spin) and which buffer is created with
+// CL_MEM_ALLOC_HOST_PTR. `units` is not used by this test.
+void OCLPerfBufferCopyOverhead::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+  const size_t numTests = sizeof(testList) / sizeof(testStruct);
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test % numTests;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  bufSize_ = 4;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and release the array first, in case CHECK_RESULT
+    // returns early; never index past the platforms that were reported.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // new[] must be paired with delete[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the count is checked here rather than error_.
+  CHECK_RESULT(num_devices == 0, "no devices");
+
+  cl_device_id *devices =
+      (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Keep the one handle needed; free the list so no check below leaks it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  sleep = ((test / numTests) % 2) > 0;
+  if (test >= (numTests * 2)) {
+    srcHost = true;
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else {
+    srcHost = false;
+  }
+  srcBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_);
+  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+
+  flags = CL_MEM_WRITE_ONLY;
+  if (!srcHost) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  }
+  dstBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_);
+  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+}
+
+// Enqueues testList[_openTest].iterations copies of bufSize_ bytes and stores
+// the average elapsed time per copy, in microseconds, in _perfInfo. After every
+// flushEvery copies (when that value is positive) and once after the loop, it
+// waits for the queue: clFinish in "sleep" mode, clFlush plus polling of the
+// latest event's execution status in "spin" mode.
+void OCLPerfBufferCopyOverhead::run(void) {
+  CPerfCounter timer;
+  // Assigned by each timed enqueue; every testList row has iterations >= 1.
+  cl_event event;
+  cl_int eventStatus = CL_COMPLETE;
+  unsigned int iter = testList[_openTest].iterations;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0,
+                                         0, bufSize_, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < iter; i++) {
+    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_,
+                                           0, 0, bufSize_, 0, NULL, &event);
+
+    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+    if ((testList[_openTest].flushEvery > 0) &&
+        (((i + 1) % testList[_openTest].flushEvery) == 0)) {
+      if (sleep) {
+        _wrapper->clFinish(cmd_queue_);
+      } else {
+        _wrapper->clFlush(cmd_queue_);
+        // Poll until the copy completes (status 0) or aborts (negative
+        // status). A failing query also ends the spin, so the loop never
+        // tests a status value that was not written.
+        do {
+          error_ =
+              _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                       sizeof(cl_int), &eventStatus, NULL);
+        } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+      }
+    }
+    // The last event is kept for the final wait below.
+    if (i != (iter - 1)) {
+      _wrapper->clReleaseEvent(event);
+    }
+  }
+
+  // Drain what is still queued before the clock is stopped.
+  if (sleep) {
+    _wrapper->clFinish(cmd_queue_);
+  } else {
+    _wrapper->clFlush(cmd_queue_);
+    do {
+      error_ =
+          _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof(cl_int), &eventStatus, NULL);
+    } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+  }
+  _wrapper->clReleaseEvent(event);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  // Only the spin path can leave a failure in error_ here; the event has
+  // already been released, so reporting it cannot leak.
+  CHECK_RESULT(error_, "clGetEventInfo failed");
+
+  // Buffer copy time in us
+  double perf = sec * 1000. * 1000. / iter;
+  _perfInfo = (float)perf;
+
+  const char *strSrc = srcHost ? "host" : "dev";
+  const char *strDst = srcHost ? "dev" : "host";
+  const char *strWait = sleep ? "sleep" : "spin";
+
+  char buf[256];
+  // iter is unsigned int, so the conversion is %u rather than %d.
+  SNPRINTF(buf, sizeof(buf), " %5s, s:%4s d:%4s i:%6u (us) ", strWait, strSrc,
+           strDst, iter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfBufferCopyOverhead::close(void) {  // releases whatever open() managed to create
+  if (srcBuffer_) {  // open() zeroes all four handles first, so 0 means "never created"
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {  // released last, after the buffers and the queue
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;  // open() set this to 0 and nothing else in this file changes it
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h
new file mode 100644
index 0000000000..983fdd51ef
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferCopyOverhead_H_
+#define _OCL_BufferCopyOverhead_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfBufferCopyOverhead : public OCLTestImp {  // measures the average time per small clEnqueueCopyBuffer call
+ public:
+  OCLPerfBufferCopyOverhead();
+  virtual ~OCLPerfBufferCopyOverhead();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // creates the context, queue and both buffers
+  virtual void run(void);  // times the copies and fills in the result description
+  virtual unsigned int close(void);  // releases what open() created
+
+  static const unsigned int NUM_ITER = 1000;  // not referenced in OCLPerfBufferCopyOverhead.cpp, which takes counts from testList
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;
+  cl_mem dstBuffer_;
+  cl_int error_;  // status of the most recent OpenCL call that stored one
+
+  unsigned int bufSize_;  // copy size in bytes; open() sets it to 4
+  bool sleep;  // true: wait with clFinish; false: clFlush and poll the event status
+  bool srcHost;  // true: source buffer gets CL_MEM_ALLOC_HOST_PTR; false: destination does
+};
+
+#endif  // _OCL_BufferCopyOverhead_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp
new file mode 100644
index 0000000000..13256a39ba
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp
@@ -0,0 +1,439 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferCopySpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Bounded formatter: sprintf_s when WIN_OS is defined, snprintf otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8
+// Buffer sizes in bytes: 4KB, 8KB, 64KB, 256KB, 1MB, 4MB, 16MB, 16MB+10
+static const unsigned int Sizes[NUM_SIZES] = {
+    4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216 + 10};
+
+static const unsigned int Iterations[2] = {1, OCLPerfBufferCopySpeed::NUM_ITER};
+
+#define BUF_TYPES 4
+// Every source kind is paired with every destination kind: 4 * 4 = 16.
+#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES)
+
+// One subtest per size x src/dst combination x entry of Iterations[].
+OCLPerfBufferCopySpeed::OCLPerfBufferCopySpeed() {
+  _numSubTests = 2 * NUM_SIZES * NUM_SUBTESTS;
+}
+OCLPerfBufferCopySpeed::~OCLPerfBufferCopySpeed() {}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // ignores all args
+
+// Fills the buffer with the word sequence 0, 1, 2, ... (size / 4 words).
+// The incoming `value` is overwritten with 0 before use, as in checkData().
+void OCLPerfBufferCopySpeed::setData(void *ptr, unsigned int size,
+                                     unsigned int value) {
+  unsigned int *words = static_cast<unsigned int *>(ptr);
+  const unsigned int wordCount = size >> 2;
+  value = 0;
+  for (unsigned int idx = 0; idx < wordCount; ++idx) words[idx] = value++;
+}
+
+// Verifies the 0, 1, 2, ... word pattern; the mismatch dump stays in bounds.
+void OCLPerfBufferCopySpeed::checkData(void *ptr, unsigned int size,
+                                       unsigned int value) {
+  const unsigned int *ptr2 = (const unsigned int *)ptr;
+  value = 0;  // the caller's value is ignored; the pattern starts at 0
+  for (unsigned int i = 0; i < (size >> 2); i++, value++) {
+    if (ptr2[i] == value) continue;
+    printf("Data validation failed at %u!  Got", i);
+    for (unsigned int j = i; (j < (size >> 2)) && (j - i < 4); j++)
+      printf(" 0x%08x", ptr2[j]);
+    printf("\nExpected 0x%08x 0x%08x 0x%08x 0x%08x\n", value, value + 1,
+           value + 2, value + 3);
+    CHECK_RESULT(true, "Data validation failed!");
+    break;
+  }
+}
+
+// Sets up one subtest: decodes `test` into buffer size, source/destination
+// buffer kinds and iteration count, then creates the context, queue and both
+// buffers on device `deviceId` of the selected platform. `units` is not used
+// and `conversion` is set to 1.0. Failures are reported via CHECK_RESULT.
+// Everything created here is released by close().
+void OCLPerfBufferCopySpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  persistent[0] = false;
+  persistent[1] = false;
+  allocHostPtr[0] = false;
+  allocHostPtr[1] = false;
+  useHostPtr[0] = false;
+  useHostPtr[1] = false;
+  memptr[0] = NULL;
+  memptr[1] = NULL;
+  alignedmemptr[0] = NULL;
+  alignedmemptr[1] = NULL;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and release the array before any check can leave
+    // the function, so the array is never leaked. An out-of-range index
+    // leaves platform NULL, which is reported just below.
+    if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char pbuf[100] = {0};
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the result is deliberately not checked here.
+  if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD = true;
+  }
+
+  char getVersion[128] = {0};
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Keep characters 7..9 of the version string (assumes "OpenCL M.m ...").
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  unsigned int srcTest = (_openTest / NUM_SIZES) % BUF_TYPES;
+  unsigned int dstTest = (_openTest / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES;
+  if (srcTest == 3) {
+    useHostPtr[0] = true;
+  } else if ((srcTest == 2) && isAMD) {
+    persistent[0] = true;
+  } else if (srcTest == 1) {
+    allocHostPtr[0] = true;
+  }
+  if ((dstTest == 1) && isAMD) {
+    persistent[1] = true;
+  } else if (dstTest == 2) {
+    allocHostPtr[1] = true;
+  } else if (dstTest == 3) {
+    useHostPtr[1] = true;
+  }
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Pick the device and free the list before the checks, which may bail out.
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (persistent[0]) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr[0]) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr[0]) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to a 4 KB boundary.
+    memptr[0] = malloc(bufSize_ + 4096);
+    CHECK_RESULT(memptr[0] == 0, "malloc(memptr) failed");
+    alignedmemptr[0] = (void *)(((size_t)memptr[0] + 4095) & ~4095);
+  }
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_,
+                                        alignedmemptr[0], &error_);
+  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+  void *mem;
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, srcBuffer_, CL_TRUE,
+                                     CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+                                     &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  setData(mem, bufSize_, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL);
+
+  flags = CL_MEM_WRITE_ONLY;
+  if (persistent[1]) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr[1]) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr[1]) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    memptr[1] = malloc(bufSize_ + 4096);
+    CHECK_RESULT(memptr[1] == 0, "malloc(memptr) failed");
+    alignedmemptr[1] = (void *)(((size_t)memptr[1] + 4095) & ~4095);
+  }
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_,
+                                        alignedmemptr[1], &error_);
+  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+
+  // Force persistent memory to be on GPU
+  // NOTE(review): the source-side branch touches dstBuffer_ and the
+  // destination-side branch touches srcBuffer_; confirm this is intended.
+  if (persistent[0]) {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, dstBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+  if (persistent[1]) {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, memBuffer, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter srcBuffer_ -> dstBuffer_ copies (after one untimed warm-up),
+// validates the destination and stores the bandwidth in _perfInfo and a
+// description of the subtest in testDescString.
+void OCLPerfBufferCopySpeed::run(void) {
+  CPerfCounter timer;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0,
+                                         0, bufSize_, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed section: enqueue every copy, then wait once for the queue to drain.
+  timer.Reset();
+  timer.Start();
+  for (unsigned int iter = 0; iter < numIter; ++iter) {
+    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_,
+                                           0, 0, bufSize_, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+  const double elapsedSec = timer.GetElapsedTime();
+
+  // Buffer copy bandwidth in GB/s
+  double gbPerSec =
+      ((double)bufSize_ * numIter * (double)(1e-09)) / elapsedSec;
+
+  // Map the destination and compare it against the pattern from setData().
+  void *mapped = _wrapper->clEnqueueMapBuffer(cmd_queue_, dstBuffer_, CL_TRUE,
+                                              CL_MAP_READ, 0, bufSize_, 0, NULL,
+                                              NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  checkData(mapped, bufSize_, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mapped, 0, NULL,
+                                    NULL);
+
+  // Labels: "per" = persistent, "AHP" = alloc-host-ptr, "UHP" = use-host-ptr,
+  // "dev" = none of those flags set.
+  const char *strSrc = persistent[0]     ? "per"
+                       : allocHostPtr[0] ? "AHP"
+                       : useHostPtr[0]   ? "UHP"
+                                         : "dev";
+  const char *strDst = persistent[1]     ? "per"
+                       : allocHostPtr[1] ? "AHP"
+                       : useHostPtr[1]   ? "UHP"
+                                         : "dev";
+
+  const bool srcInSysmem = allocHostPtr[0] || useHostPtr[0];
+  const bool dstInSysmem = allocHostPtr[1] || useHostPtr[1];
+  // Double results when src and dst are both on device
+  if ((persistent[0] || !srcInSysmem) && (persistent[1] || !dstInSysmem)) {
+    gbPerSec *= 2.0;
+  }
+  // Double results when src and dst are both in sysmem
+  if (srcInSysmem && dstInSysmem) {
+    gbPerSec *= 2.0;
+  }
+  _perfInfo = (float)gbPerSec;
+
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_,
+           strSrc, strDst, numIter);
+  testDescString = desc;
+}
+
+// Releases what open() created (both buffers, the USE_HOST_PTR backing
+// allocations, the command queue and the context) and returns _crcword.
+// Release failures are reported with CHECK_RESULT_NO_RETURN.
+unsigned int OCLPerfBufferCopySpeed::close(void) {
+  if (srcBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  // free(NULL) is a no-op, so the host allocations need no guard.
+  free(memptr[0]);
+  free(memptr[1]);
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+// Rectangular variant of run(): times clEnqueueCopyBufferRect over a
+// width x width byte region, where width = floor(sqrt(bufSize_)). Skipped on
+// OpenCL 1.0 platforms. The destination contents are not validated here.
+void OCLPerfBufferCopyRectSpeed::run(void) {
+  CPerfCounter timer;
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t srcOrigin[3] = {0, 0, 0};
+  size_t dstOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+
+  // Clamp iteration count for non-local writes to shorten test runtime
+  unsigned int testNumIter = numIter;
+  if (allocHostPtr[1] && (testNumIter > 100)) {
+    testNumIter = 100;
+  }
+
+  // Skip for 1.0 platforms
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    char skipMsg[256];
+    SNPRINTF(skipMsg, sizeof(skipMsg), " SKIPPED ");
+    testDescString = skipMsg;
+    return;
+  }
+
+  // Warm up
+  error_ = _wrapper->clEnqueueCopyBufferRect(cmd_queue_, srcBuffer_, dstBuffer_,
+                                             srcOrigin, dstOrigin, region,
+                                             width, 0, width, 0, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed section: enqueue every copy, then wait once for the queue to drain.
+  timer.Reset();
+  timer.Start();
+  for (unsigned int iter = 0; iter < testNumIter; ++iter) {
+    error_ = _wrapper->clEnqueueCopyBufferRect(
+        cmd_queue_, srcBuffer_, dstBuffer_, srcOrigin, dstOrigin, region, width,
+        0, width, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+  const double elapsedSec = timer.GetElapsedTime();
+
+  // Buffer copy bandwidth in GB/s (byte count is bufSize_, not width * width)
+  double gbPerSec =
+      ((double)bufSize_ * testNumIter * (double)(1e-09)) / elapsedSec;
+
+  // Labels: "per" = persistent, "AHP" = alloc-host-ptr, "UHP" = use-host-ptr,
+  // "dev" = none of those flags set.
+  const char *strSrc = persistent[0]     ? "per"
+                       : allocHostPtr[0] ? "AHP"
+                       : useHostPtr[0]   ? "UHP"
+                                         : "dev";
+  const char *strDst = persistent[1]     ? "per"
+                       : allocHostPtr[1] ? "AHP"
+                       : useHostPtr[1]   ? "UHP"
+                                         : "dev";
+
+  const bool srcInSysmem = allocHostPtr[0] || useHostPtr[0];
+  const bool dstInSysmem = allocHostPtr[1] || useHostPtr[1];
+  // Double results when src and dst are both on device
+  if ((persistent[0] || !srcInSysmem) && (persistent[1] || !dstInSysmem)) {
+    gbPerSec *= 2.0;
+  }
+  // Double results when src and dst are both in sysmem
+  if (srcInSysmem && dstInSysmem) {
+    gbPerSec *= 2.0;
+  }
+  _perfInfo = (float)gbPerSec;
+
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_,
+           strSrc, strDst, testNumIter);
+  testDescString = desc;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h
new file mode 100644
index 0000000000..7599cecfbd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferCopySpeed_H_
+#define _OCL_BufferCopySpeed_H_
+
+#include "OCLTestImp.h"
+
+// Buffer-to-buffer copy bandwidth test; index 0 of each pair is the source.
+class OCLPerfBufferCopySpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferCopySpeed();
+  virtual ~OCLPerfBufferCopySpeed();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+  void setData(void* ptr, unsigned int size, unsigned int value);
+  void checkData(void* ptr, unsigned int size, unsigned int value);
+
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;
+  cl_mem dstBuffer_;
+  cl_int error_;
+
+  unsigned int bufSize_;     // bytes per copy
+  bool persistent[2];        // [0] = source, [1] = destination
+  bool allocHostPtr[2];
+  bool useHostPtr[2];
+  unsigned int numIter;      // timed copies per run()
+  bool isAMD;
+  char platformVersion[32];  // three characters taken from CL_PLATFORM_VERSION
+  void* memptr[2];           // raw malloc() blocks backing USE_HOST_PTR
+  void* alignedmemptr[2];    // memptr rounded up to a 4 KB boundary
+};
+
+// Variant of the copy test whose run() uses clEnqueueCopyBufferRect.
+class OCLPerfBufferCopyRectSpeed : public OCLPerfBufferCopySpeed {
+ public:
+  OCLPerfBufferCopyRectSpeed() : OCLPerfBufferCopySpeed() {}
+
+  virtual void run();
+};
+#endif  // _OCL_BufferCopySpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp
new file mode 100644
index 0000000000..ca076d3c6e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp
@@ -0,0 +1,334 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Bounded formatter: sprintf_s when WIN_OS is defined, snprintf otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8
+// Buffer sizes in bytes: 1KB, 32KB, 64KB, 128KB, 256KB, 1MB, 4MB, 16MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216};
+
+static cl_uint blockedSubtests;  // subtests below this index read blocking
+
+static const unsigned int Iterations[2] = {1, OCLPerfBufferReadSpeed::NUM_ITER};
+#define NUM_OFFSETS 1
+static const unsigned int offsets[NUM_OFFSETS] = {0};
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)
+extern const char *blkStr[2];  // defined elsewhere; indexed by the blocking flag
+
+// Two blocking passes (one per Iterations[] entry) plus one non-blocking pass.
+OCLPerfBufferReadSpeed::OCLPerfBufferReadSpeed() {
+  blockedSubtests = NUM_SIZES * NUM_SUBTESTS * 2;
+  _numSubTests = blockedSubtests + NUM_SIZES * NUM_SUBTESTS;
+}
+
+OCLPerfBufferReadSpeed::~OCLPerfBufferReadSpeed() {}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // ignores all args
+
+// Sets up one read subtest: decodes `test` into buffer size, buffer kind and
+// iteration count, then creates the context, queue and outBuffer_ on device
+// `deviceId` of the selected platform. `units` is not used and `conversion`
+// is set to 1.0. Everything created here is released by close().
+void OCLPerfBufferReadSpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;  // only the USE_HOST_PTR subtests select an entry of offsets[]
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and release the array before any check can leave
+    // the function, so the array is never leaked. An out-of-range index
+    // leaves platform NULL, which is reported just below.
+    if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char pbuf[100] = {0};
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the result is deliberately not checked here.
+  if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD = true;
+  }
+
+  char getVersion[128] = {0};
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Keep characters 7..9 of the version string (assumes "OpenCL M.m ...").
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  const unsigned int kind = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (kind > 2) {
+    useHostPtr = true;
+    offset = offsets[kind - 3];
+  } else if ((kind == 2) && isAMD) {
+    persistent = true;
+  } else if (kind == 1) {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter =
+        4 * OCLPerfBufferReadSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Pick the device and free the list before the checks, which may bail out.
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter reads of outBuffer_ into a scratch host buffer after one
+// blocking warm-up read; publishes GB/s in _perfInfo and testDescString.
+void OCLPerfBufferReadSpeed::run(void) {
+  CPerfCounter timer;
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+  char *mem = new char[bufSize_];
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
+                                         bufSize_, mem, 0, NULL, NULL);
+  if (error_ == CL_SUCCESS) {
+    timer.Reset();
+    timer.Start();
+    for (unsigned int i = 0; (i < numIter) && (error_ == CL_SUCCESS); i++) {
+      error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, blocking,
+                                             0, bufSize_, mem, 0, NULL, NULL);
+    }
+    // Also drains non-blocking reads still in flight after a failed enqueue.
+    if (blocking != CL_TRUE) {
+      _wrapper->clFinish(cmd_queue_);
+    }
+    timer.Stop();
+  }
+  // Free the host buffer before the check: CHECK_RESULT may leave run().
+  delete[] mem;  // allocated with new[]
+  CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
+  double sec = timer.GetElapsedTime();
+
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
+
+// Releases outBuffer_, the queue, the context and the USE_HOST_PTR backing
+// store created by open(), then returns _crcword.
+unsigned int OCLPerfBufferReadSpeed::close(void) {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  free(hostMem);  // free(NULL) is a no-op
+
+  return _crcword;
+}
+
+// Rectangular variant of run(): times clEnqueueReadBufferRect over a width x
+// width byte region, width = floor(sqrt(bufSize_)); skipped on OpenCL 1.0.
+void OCLPerfBufferReadRectSpeed::run(void) {
+  CPerfCounter timer;
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Clamp iterations to reduce run time
+  unsigned int testNumIter = (numIter < 100 ? numIter : 100);
+
+  // Skip for 1.0 platforms; done before allocating so nothing is leaked
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    char buf[256];
+    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+    testDescString = buf;
+    return;
+  }
+  char *mem = new char[bufSize_];
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBufferRect(
+      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+  if (error_ == CL_SUCCESS) {
+    timer.Reset();
+    timer.Start();
+    for (unsigned int i = 0; (i < testNumIter) && (error_ == CL_SUCCESS); i++) {
+      error_ = _wrapper->clEnqueueReadBufferRect(
+          cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region,
+          width, 0, width, 0, mem, 0, NULL, NULL);
+    }
+    // Also drains non-blocking reads still in flight after a failed enqueue.
+    if (blocking != CL_TRUE) {
+      _wrapper->clFinish(cmd_queue_);
+    }
+    timer.Stop();
+  }
+  // Free the host buffer before the check: CHECK_RESULT may leave run().
+  delete[] mem;  // allocated with new[]
+  CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
+  double sec = timer.GetElapsedTime();
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  // Report the clamped count, which is what was actually timed.
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
+           blkStr[blocking], testNumIter, str);
+  testDescString = buf;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h
new file mode 100644
index 0000000000..01df4a5815
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferReadSpeed_H_
+#define _OCL_BufferReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Buffer-read bandwidth test driven through the open()/run()/close() hooks.
+class OCLPerfBufferReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferReadSpeed();
+  virtual ~OCLPerfBufferReadSpeed();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;  // queue the timed reads are enqueued on
+  cl_mem outBuffer_;            // buffer object being read
+  cl_int error_;                // receives the status of OpenCL calls
+
+  unsigned int bufSize_;  // buffer size in bytes, printed in the result string
+  bool persistent;        // these three flags pick the label in the result
+  bool allocHostPtr;      // string; presumably they also pick the buffer
+  bool useHostPtr;        // flags in open() -- TODO confirm in the .cpp
+  unsigned int numIter;   // requested number of timed iterations
+  char* hostMem;
+  char* alignedMem;
+  size_t alignment;
+  unsigned int offset;       // printed next to the USE_HOST_PTR label
+  bool isAMD;
+  char platformVersion[32];  // '1' at [0] and '0' at [2] skip the Rect run
+};
+
+// Overrides run() only; presumably to time clEnqueueReadBufferRect -- confirm.
+class OCLPerfBufferReadRectSpeed : public OCLPerfBufferReadSpeed {
+ public:
+  OCLPerfBufferReadRectSpeed() : OCLPerfBufferReadSpeed() {}
+ public:
+  virtual void run(void);
+};
+
+#endif  // _OCL_BufferReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp
new file mode 100644
index 0000000000..76cae8dfc3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp
@@ -0,0 +1,333 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfBufferWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8
+// Buffer sizes in bytes: 1 KB, 32 KB, 64 KB, 128 KB, 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216};
+// Sub-tests below this index use blocking writes; set by the constructor.
+static cl_uint blockedSubtests;
+// Iteration counts for the two blocking passes.
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfBufferWriteSpeed::NUM_ITER};
+// Byte offsets added to the aligned host pointer in USE_HOST_PTR mode.
+#define NUM_OFFSETS 1
+static const unsigned int offsets[NUM_OFFSETS] = {0};
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)  // allocation modes per buffer size
+extern const char *blkStr[2];  // label indexed by the cl_bool blocking flag
+
+OCLPerfBufferWriteSpeed::OCLPerfBufferWriteSpeed() {
+  // Two blocking passes come first, followed by one non-blocking pass.
+  blockedSubtests = NUM_SIZES * NUM_SUBTESTS * 2;
+  _numSubTests = blockedSubtests + NUM_SIZES * NUM_SUBTESTS;
+}
+
+OCLPerfBufferWriteSpeed::~OCLPerfBufferWriteSpeed() {}  // releases are in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,  // no-op hook
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // for clCreateContext
+
+// Configure sub-test `test` on device `deviceId`: decode buffer size, allocation
+// mode and iteration count from the index, then create the context, queue and
+// destination buffer. `units` is untouched; `conversion` is always set to 1.0.
+void OCLPerfBufferWriteSpeed::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  // A _platformIndex past the list leaves `platform` NULL; reported below.
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    // Only the selected handle is kept; release the new[] array before checking.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // Zero-filled so the strcmp below is well defined even if the query fails.
+    char pbuf[100] = {0};
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is deliberately not checked here.
+    if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  // CL_PLATFORM_VERSION is expected to read "OpenCL <major>.<minor> ..."; keep
+  // the three characters at index 7..9 ("<major>.<minor>").
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  // Allocation mode: 0 = default, 1 = ALLOC_HOST_PTR, 2 = persistent (AMD only),
+  // 3 and up = USE_HOST_PTR with offsets[mode - 3].
+  const unsigned int mode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (mode > 2) {
+    useHostPtr = true;
+    offset = offsets[mode - 3];
+  } else if ((mode == 2) && isAMD) {
+    persistent = true;
+  } else if (mode == 1) {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter =
+        4 * OCLPerfBufferWriteSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // The list was only needed to pick one handle; free it on every path.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, outBuffer_, memBuffer, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Time numIter host-to-device writes of bufSize_ bytes into outBuffer_ and
+// publish the bandwidth (GB/s) in _perfInfo plus a label in testDescString.
+void OCLPerfBufferWriteSpeed::run(void) {
+  CPerfCounter timer;
+  char *mem = new char[bufSize_];
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
+                                          bufSize_, mem, 0, NULL, NULL);
+  if (error_ == CL_SUCCESS) {
+    timer.Reset();
+    timer.Start();
+    for (unsigned int i = 0; (i < numIter) && (error_ == CL_SUCCESS); i++) {
+      error_ = _wrapper->clEnqueueWriteBuffer(
+          cmd_queue_, outBuffer_, blocking, 0, bufSize_, mem, 0, NULL, NULL);
+    }
+    if (blocking != CL_TRUE) {
+      _wrapper->clFinish(cmd_queue_);
+    }
+    timer.Stop();
+  }
+  // Every write has finished or failed by now; free the new[] array on all paths.
+  delete[] mem;
+  CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+
+  // Buffer write bandwidth in GB/s
+  double sec = timer.GetElapsedTime();
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %3s i: %4u %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
+
+// Release the buffer, queue and context if they were created, then the host
+// backing store. Each status goes to CHECK_RESULT_NO_RETURN. Returns _crcword.
+unsigned int OCLPerfBufferWriteSpeed::close(void) {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  free(hostMem);  // free(NULL) is a no-op, so no guard is needed
+
+  return _crcword;
+}
+
+// Rect variant: times numIter clEnqueueWriteBufferRect calls over a width x
+// width byte region (width = sqrt(bufSize_) truncated); GB/s uses bufSize_.
+void OCLPerfBufferWriteRectSpeed::run(void) {
+  // Skip for 1.0 platforms; tested before allocating so the skip cannot leak.
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    char buf[256];
+    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+    testDescString = buf;
+    return;
+  }
+
+  CPerfCounter timer;
+  char *mem = new char[bufSize_];
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueWriteBufferRect(
+      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+  if (error_ == CL_SUCCESS) {
+    timer.Reset();
+    timer.Start();
+    for (unsigned int i = 0; (i < numIter) && (error_ == CL_SUCCESS); i++) {
+      error_ = _wrapper->clEnqueueWriteBufferRect(
+          cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region,
+          width, 0, width, 0, mem, 0, NULL, NULL);
+    }
+    if (blocking != CL_TRUE) {
+      _wrapper->clFinish(cmd_queue_);
+    }
+    timer.Stop();
+  }
+  // Every write has finished or failed by now; free the new[] array on all paths.
+  delete[] mem;
+  CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
+
+  // Buffer write bandwidth in GB/s
+  double sec = timer.GetElapsedTime();
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %3s i: %4u %29s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h
new file mode 100644
index 0000000000..19e062d172
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BufferWriteSpeed_H_
+#define _OCL_BufferWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Buffer-write bandwidth test: open() configures one sub-test, run() times it.
+class OCLPerfBufferWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferWriteSpeed();
+  virtual ~OCLPerfBufferWriteSpeed();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;  // base count for the timed loops
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // destination of the timed writes
+  cl_int error_;      // receives the status of OpenCL calls
+
+  unsigned int bufSize_;  // bytes per write, taken from the Sizes table
+  bool persistent;        // adds CL_MEM_USE_PERSISTENT_MEM_AMD (needs isAMD)
+  bool allocHostPtr;      // adds CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;        // adds CL_MEM_USE_HOST_PTR backed by alignedMem
+  unsigned int numIter;   // number of timed writes
+  char* hostMem;          // malloc'ed backing store, freed in close()
+  char* alignedMem;       // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;       // set to 4096 in open()
+  unsigned int offset;    // only assigned when useHostPtr is set
+  bool isAMD;             // platform has devices and the AMD vendor string
+  char platformVersion[32];  // chars 7..9 of CL_PLATFORM_VERSION, NUL-ended
+};
+
+// Same setup as OCLPerfBufferWriteSpeed; run() times clEnqueueWriteBufferRect.
+class OCLPerfBufferWriteRectSpeed : public OCLPerfBufferWriteSpeed {
+ public:
+  OCLPerfBufferWriteRectSpeed() : OCLPerfBufferWriteSpeed() {}
+ public:
+  virtual void run(void);
+};
+
+#endif  // _OCL_BufferWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp
new file mode 100644
index 0000000000..3e108f5b48
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp
@@ -0,0 +1,304 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfCPUMemSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Buffer sizes in bytes: 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+// Iteration counts a sub-test can select (ITER_COUNT entries).
+#define ITER_COUNT 2
+static const unsigned int Iterations[2] = {1, OCLPerfCPUMemSpeed::NUM_ITER};
+#define NUM_OFFSETS 1
+static const unsigned int offsets[NUM_OFFSETS] = {0};  // USE_HOST_PTR offsets
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)  // allocation modes per buffer size
+OCLPerfCPUMemSpeed::OCLPerfCPUMemSpeed() {
+  _numSubTests = 3 * (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT);  // 3 test groups
+}
+
+OCLPerfCPUMemSpeed::~OCLPerfCPUMemSpeed() {}  // releases are done in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,  // no-op hook
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // for clCreateContext
+
+// Configure sub-test `test` on device `deviceId`: decode buffer size, allocation
+// mode, iteration count and copy direction from the index, then create the
+// context, queue and buffer. `units` is untouched; `conversion` is set to 1.0.
+void OCLPerfCPUMemSpeed::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  testMemset = false;
+  isAMD = false;
+  gpuSrc = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  // A _platformIndex past the list leaves `platform` NULL; reported below.
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    // Only the selected handle is kept; release the new[] array before checking.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // Zero-filled so the strcmp below is well defined even if the query fails.
+    char pbuf[100] = {0};
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so only the count is checked, not error_.
+    CHECK_RESULT(num_devices == 0, "No devices found, cannot proceed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  // Allocation mode: 0 = default, 1 = ALLOC_HOST_PTR, 2 = persistent (AMD only),
+  // 3 and up = USE_HOST_PTR with offsets[mode - 3].
+  const unsigned int mode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (mode > 2) {
+    useHostPtr = true;
+    offset = offsets[mode - 3];
+  } else if ((mode == 2) && isAMD) {
+    persistent = true;
+  } else if (mode == 1) {
+    allocHostPtr = true;
+  }
+
+  // Sub-tests come in three equal groups: memcpy to the device, memcpy from the
+  // device (capped at 10 iterations), then memset.
+  numIter = Iterations[(_openTest / (NUM_SIZES * NUM_SUBTESTS)) % 2];
+  if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT * 2))
+    testMemset = true;
+  else if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT)) {
+    gpuSrc = true;
+    numIter = std::min(numIter, 10u);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // The list was only needed to pick one handle; free it on every path.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags;
+  if (gpuSrc) {
+    flags = CL_MEM_WRITE_ONLY;
+    mapFlags = CL_MAP_READ;
+  } else {
+    flags = CL_MEM_READ_ONLY;
+    mapFlags = CL_MAP_WRITE;
+  }
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to `alignment` and shifted.
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Map outBuffer_ and time numIter CPU passes over the mapping: memset, memcpy
+// out of it (gpuSrc) or memcpy into it. Publishes GB/s in _perfInfo and a
+// label in testDescString.
+void OCLPerfCPUMemSpeed::run(void) {
+  CPerfCounter timer;
+
+  void *mem;
+  // Warm up
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+  // Check before touching `mem` (and before allocating): a failed map must not
+  // reach the memset/memcpy below.
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+
+  char *cpumem = new char[bufSize_];
+
+  // Timed section: only CPU accesses through the mapped pointer.
+  timer.Reset();
+  timer.Start();
+  if (testMemset) {
+    for (unsigned int i = 0; i < numIter; i++) {
+      memset(mem, 0, bufSize_);
+    }
+  } else if (gpuSrc) {
+    for (unsigned int i = 0; i < numIter; i++) {
+      memcpy((void *)cpumem, mem, bufSize_);
+    }
+  } else {
+    for (unsigned int i = 0; i < numIter; i++) {
+      memcpy(mem, (void *)cpumem, bufSize_);
+    }
+  }
+  timer.Stop();
+
+  delete[] cpumem;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  double sec = timer.GetElapsedTime();
+
+  // CPU memset/memcpy bandwidth over the mapped buffer in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  const char *str2 = "memcpy to dev";
+  if (testMemset) {
+    str2 = "memset to dev";
+  } else if (gpuSrc) {
+    str2 = "memcpy from dev";
+  }
+
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %15s i: %4u %29s ", bufSize_, str2,
+           numIter, str);
+  testDescString = buf;
+}
+
+// Release the buffer, queue and context if they were created, then the host
+// backing store. Each status goes to CHECK_RESULT_NO_RETURN. Returns _crcword.
+unsigned int OCLPerfCPUMemSpeed::close(void) {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  free(hostMem);  // free(NULL) is a no-op, so no guard is needed
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h
new file mode 100644
index 0000000000..3313d53795
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_CPUMemSpeed_H_
+#define _OCL_CPUMemSpeed_H_
+
+#include "OCLTestImp.h"
+
+// CPU memset/memcpy bandwidth over a mapped OpenCL buffer.
+class OCLPerfCPUMemSpeed : public OCLTestImp {
+ public:
+  OCLPerfCPUMemSpeed();
+  virtual ~OCLPerfCPUMemSpeed();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 100;  // second entry of Iterations[]
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // buffer that run() maps
+  cl_int error_;      // receives the status of OpenCL calls
+
+  unsigned int bufSize_;  // bytes per pass, taken from the Sizes table
+  bool persistent;        // adds CL_MEM_USE_PERSISTENT_MEM_AMD (needs isAMD)
+  bool allocHostPtr;      // adds CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;        // adds CL_MEM_USE_HOST_PTR backed by alignedMem
+  unsigned int numIter;   // number of timed passes
+  bool testMemset;        // time memset instead of memcpy
+  char* hostMem;          // malloc'ed backing store, freed in close()
+  char* alignedMem;       // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;       // set to 4096 in open()
+  unsigned int offset;    // only assigned when useHostPtr is set
+  bool isAMD;             // vendor string is "Advanced Micro Devices, Inc."
+  bool gpuSrc;            // memcpy reads from the mapping instead of writing
+  cl_map_flags mapFlags;  // CL_MAP_READ when gpuSrc, otherwise CL_MAP_WRITE
+};
+
+#endif  // _OCL_CPUMemSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp
new file mode 100644
index 0000000000..81b2b676ae
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp
@@ -0,0 +1,146 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfCommandQueue.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+static const size_t BufSize = 0x1000;    // cl_int4 elements per buffer
+static const size_t Iterations = 0x100;  // iteration budget split up in run()
+static const size_t TotalQueues = 4;     // modulus selecting the queue count
+static const size_t TotalBufs = 4;       // divisor selecting the buffer count
+
+// One subtest per (buffer-count tier, queue-count) pairing used by open()/run().
+OCLPerfCommandQueue::OCLPerfCommandQueue() : failed_(false) {
+  _numSubTests = TotalBufs * TotalQueues;
+}
+
+OCLPerfCommandQueue::~OCLPerfCommandQueue() {}  // nothing to release here
+
+void OCLPerfCommandQueue::open(unsigned int test, char* units,
+                               double& conversion, unsigned int deviceId) {
+  // Time the base-class open; subtest 0 prints it as the runtime init cost.
+  _deviceId = deviceId;
+  CPerfCounter initTimer;
+  initTimer.Reset();
+  initTimer.Start();
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  initTimer.Stop();
+  if (test == 0) {
+    const float initMs = static_cast<float>(initTimer.GetElapsedTime() * 1000);
+    printf("Runtime load/init time: %0.2f ms\n", initMs);
+  }
+  test_ = test;
+  // Only GPU devices are exercised; otherwise failed_ turns run() into a no-op.
+  cl_device_type devType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(devType), &devType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  if ((devType & CL_DEVICE_TYPE_GPU) == 0) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  // test / TotalBufs selects how many buffers are kept alive for this subtest.
+  static const size_t MemObjects[] = {1, 100, 1000, 5000};
+  const size_t wanted = MemObjects[test_ / TotalBufs];
+  const size_t bytes = BufSize * sizeof(cl_int4);
+  for (size_t left = wanted; left != 0; --left) {
+    cl_mem mem = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bytes,
+                                          NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(mem);
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,  // no-op callback
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused in this file
+
+// Times create+release of the command queues for this subtest and stores the
+// average milliseconds per queue in _perfInfo. The blocking writes between
+// create and release run with the timer stopped.
+void OCLPerfCommandQueue::run(void) {
+  if (failed_) {
+    return;
+  }
+  // Zero-filled host source. A vector is released on every exit path, including
+  // the early returns CHECK_RESULT appears to take (a raw new[] leaks there).
+  std::vector<cl_int4> values(BufSize);
+  CPerfCounter timer;
+  static const size_t Queues[] = {1, 2, 4, 8};
+  size_t numQueues = Queues[test_ % TotalQueues];
+
+  // More queues or a higher buffer tier means fewer timed iterations.
+  size_t iter =
+      Iterations / (numQueues * ((size_t)1 << (test_ / TotalBufs + 1)));
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  cl_int writeError = CL_SUCCESS;  // first clEnqueueWriteBuffer failure seen
+  timer.Reset();
+  timer.Start();
+  for (size_t i = 0; i < iter; ++i) {
+    for (size_t q = 0; q < numQueues; ++q) {
+      cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+          context_, devices_[_deviceId], 0, &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+      cmdQueues[q] = cmdQueue;
+    }
+    timer.Stop();
+    for (size_t q = 0; q < numQueues; ++q) {
+      for (size_t b = 0; b < buffers_.size(); ++b) {
+        const cl_int rc = _wrapper->clEnqueueWriteBuffer(
+            cmdQueues[q], buffers_[b], CL_TRUE, 0, sizeof(cl_int4), &values[0],
+            0, NULL, NULL);
+        if (writeError == CL_SUCCESS) writeError = rc;
+      }
+    }
+    timer.Start();
+    for (size_t q = 0; q < numQueues; ++q) {
+      error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+      CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                             "clReleaseCommandQueue() failed");
+    }
+    // Test the write status only now: the queues are already released, and
+    // error_ cannot be used for it because the release loop overwrote it.
+    CHECK_RESULT((writeError != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+  }
+  timer.Stop();
+  std::stringstream stream;
+  stream << "Create+destroy time for " << numQueues << " queues and "
+         << buffers_.size() << " buffers";
+  stream.precision(3);
+  stream.width(5);
+  stream.setf(std::ios::fixed, std::ios::floatfield);
+  stream << "(ms)";
+  testDescString = stream.str();
+  _perfInfo =
+      static_cast<float>(timer.GetElapsedTime() * 1000 / (iter * numQueues));
+}
+
+unsigned int OCLPerfCommandQueue::close(void) { return OCLTestImp::close(); }  // base class does the cleanup
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h
new file mode 100644
index 0000000000..cd6f710a18
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_COMMAND_QUEUE_H_
+#define _OCL_PERF_COMMAND_QUEUE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfCommandQueue : public OCLTestImp {
+ public:
+  OCLPerfCommandQueue();
+  virtual ~OCLPerfCommandQueue();
+
+ public:  // test entry points, implemented in OCLPerfCommandQueue.cpp
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // set by open() on a non-GPU device; run() then returns at once
+  unsigned int test_;  // subtest index stored by open() and decoded by run()
+};
+
+#endif  // _OCL_PERF_COMMAND_QUEUE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp
new file mode 100644
index 0000000000..0c4ba342ef
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp
@@ -0,0 +1,563 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfConcurrency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Bounded sprintf: sprintf_s under WIN_OS (quiets warnings), snprintf otherwise
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+typedef struct {  // one view window; open() derives origin and steps from it
+  double x;      // window centre, x
+  double y;      // window centre, y
+  double width;  // extent of the window along each axis
+} coordRec;
+
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},  // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);  // entries
+
+static const char *float_mandel_vec =  // OpenCL C source, built in open()
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % (width/4);\n"
+    "    int j = tid / (width/4);\n"
+    "    int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
+    "    int4 vecj = (int4)(j, j, j, j);\n"
+    "    float4 x0;\n"
+    "    x0.s0 = (float)(xPos + xStep*veci.s0);\n"
+    "    x0.s1 = (float)(xPos + xStep*veci.s1);\n"
+    "    x0.s2 = (float)(xPos + xStep*veci.s2);\n"
+    "    x0.s3 = (float)(xPos + xStep*veci.s3);\n"
+    "    float4 y0;\n"
+    "    y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
+    "    y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
+    "    y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
+    "    y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
+    "\n"
+    "    float4 x = x0;\n"
+    "    float4 y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    float4 tmp;\n"
+    "    int4 stay;\n"
+    "    int4 ccount = 0;\n"
+    "    float4 savx = x;\n"
+    "    float4 savy = y;\n"
+    "    stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "    for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
+    "maxIter); iter+=16)\n"
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "        savx = (stay ? x : savx);\n"
+    "        savy = (stay ? y : savy);\n"
+    "        ccount -= stay*16;\n"
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            // More efficient to use scalar ops here: Why?\n"
+    "            stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
+    "maxIter);\n"
+    "            stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
+    "maxIter);\n"
+    "            stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
+    "maxIter);\n"
+    "            stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
+    "maxIter);\n"
+    "            tmp = x;\n"
+    "            x = x*x + x0 - y*y;\n"
+    "            y = 2.0f*tmp*y + y0;\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
+    "            savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
+    "            savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
+    "            savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
+    "            savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
+    "            savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
+    "            savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
+    "            savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
+    "        } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
+    "    }\n"
+    "    __global uint4 *vecOut = (__global uint4 *)out;\n"
+    "    vecOut[tid] = convert_uint4(ccount);\n"
+    "}\n";
+
+OCLPerfConcurrency::OCLPerfConcurrency() { _numSubTests = 10 * numCoords; }  // 10 subtests per coords[] entry
+
+OCLPerfConcurrency::~OCLPerfConcurrency() {}  // OpenCL objects are released in close()
+
+void OCLPerfConcurrency::setData(cl_mem buffer, unsigned int val) {
+  cl_command_queue q = cmd_queue_[0];  // fill helper always works on queue 0
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      q, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, &error_);
+  if (data == NULL) return;  // map failed: error_ has the code, skip the fill
+  for (unsigned int i = 0; i < width_; i++) data[i] = val;  // width_ uints
+  error_ = _wrapper->clEnqueueUnmapMemObject(q, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(q);  // wait for the unmap before returning
+}
+
+// Sums the width_ uints stored in `buffer` into totalIters (blocking map).
+void OCLPerfConcurrency::checkData(cl_mem buffer) {
+  cl_command_queue q = cmd_queue_[0];  // readback always goes through queue 0
+  totalIters = 0;  // reset first so a failed map leaves a zero sum behind
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      q, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, &error_);
+  if (data == NULL) return;  // map failed: error_ has the code
+  // Each element is the iteration count the kernel stored for one pixel.
+  for (unsigned int i = 0; i < width_; i++) totalIters += data[i];
+  error_ = _wrapper->clEnqueueUnmapMemObject(q, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(q);  // wait for the unmap before returning
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,  // no-op callback
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // given to clCreateContext
+
+// Prepares one subtest: resolves the platform and device, creates the
+// context, then the queues, output buffers, programs and kernels chosen by
+// `test`, seeds the buffers and computes the maxIter used by the timed run.
+// `units` is not touched; `conversion` is set to 1. Non-GPU runs return early.
+void OCLPerfConcurrency::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  unsigned int i;
+
+  if (type_ != CL_DEVICE_TYPE_GPU) {
+    char msg[256];
+    SNPRINTF(msg, sizeof(msg), "No GPU devices present. Exiting!\t");
+    testDescString = msg;
+    return;
+  }
+
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  // Start from an empty state so close() sees well-defined counts and handles
+  // even when one of the checks below stops this function early.
+  context_ = 0;
+  num_cmd_queues = num_programs = num_kernels = num_outbuffers = 0;
+  for (i = 0; i < MAX_ASYNC_QUEUES; i++) {
+    cmd_queue_[i] = 0;
+    program_[i] = 0;
+    kernel_[i] = 0;
+    outBuffer_[i] = 0;
+  }
+
+  // Maximum iteration count
+  // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible
+  // by 16.
+  // NOTE: Can increase to get better peak performance numbers, but be sure not
+  // to TDR slow ASICs!
+  // NOTE: The warmup run uses maxIter = 256; the value for the actual run is
+  // derived from the clock frequency and CU count at the end of this function.
+  maxIter = 256;
+
+  // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
+  // processes 4 pixels at once.
+  // NOTE: Can increase to get better peak performance numbers, but be sure not
+  // to TDR slow ASICs!
+  width_ = 256;
+
+  // One cl_uint of output per pixel.
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (numPlatforms > 0 && _platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    // Array delete to match new[], done before the check so it cannot leak.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  // Get the number of requested devices. The status is deliberately not
+  // checked: the runtime returns an error instead of 0 devices without a GPU.
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  CHECK_RESULT(num_devices == 0, "no devices");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device, then free the temporary list right away. */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  // Single-device context; notify_callback ignores everything it is passed.
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  // AMD-specific query; its result sizes the resources of subtests 4-9.
+  cl_uint numAsyncQueues = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      device, CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD, sizeof(numAsyncQueues),
+      &numAsyncQueues, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  CHECK_RESULT(numAsyncQueues > MAX_ASYNC_QUEUES,
+               "numAsyncQueues is too large for this test");
+
+  // CL_DEVICE_MAX_COMPUTE_UNITS is a cl_uint: query it into one and widen
+  // afterwards, instead of letting the runtime fill only part of a size_t.
+  cl_uint computeUnits = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(computeUnits), &computeUnits, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  numCUs = computeUnits;
+
+  switch (_openTest) {
+    case 0:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers = 1;
+      break;
+
+    case 1:
+      num_cmd_queues = 1;
+      num_programs = 1;
+      num_kernels = 1;
+      num_outbuffers = 2;
+      break;
+
+    case 2:
+      num_cmd_queues = 1;
+      num_programs = 2;
+      num_kernels = 2;
+      num_outbuffers = 2;
+      break;
+
+    case 3:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers = 2;
+      break;
+
+    case 4:
+    case 5:
+    case 6:
+    case 7:
+    case 8:
+    case 9:
+      // NOTE(review): "% 8" maps numAsyncQueues == 8 to 0 resources, and all
+      // six subtests get the same configuration - confirm this is intended.
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers =
+          numAsyncQueues % 8;
+      break;
+
+    default:
+      // Unknown subtest: the counts stay 0, so nothing is created or run.
+      break;
+  }
+
+  // Create the resources in the quantities selected by the switch above.
+  for (i = 0; i < num_cmd_queues; i++) {
+    cmd_queue_[i] = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+    CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+  }
+
+  // Every program is created and built separately from the same source text.
+  const char *tmp = float_mandel_vec;
+  for (i = 0; i < num_programs; i++) {
+    program_[i] = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&tmp, NULL, &error_);
+    CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
+
+    error_ = _wrapper->clBuildProgram(program_[i], 1, &device, "", NULL, NULL);
+
+    if (error_ != CL_SUCCESS) {
+      // Print the build log, then report the failure (the condition must be
+      // true for CHECK_RESULT to fire; a literal 0 would never trigger it).
+      char log[16384];
+      log[0] = '\0';  // keeps the printf safe if the log query itself fails
+      _wrapper->clGetProgramBuildInfo(program_[i], device,
+                                      CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                      NULL);
+      printf("Build error -> %s\n", log);
+      CHECK_RESULT(true, "clBuildProgram failed");
+    }
+  }
+
+  for (i = 0; i < num_kernels; i++) {
+    kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
+    CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
+  }
+
+  // Derive the window origin and per-pixel steps from the coords[] entry.
+  coordIdx = _openTest % numCoords;
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Kernel signature: (out, width, xPos, yPos, xStep, yStep, maxIter).
+  for (i = 0; i < num_kernels; i++) {
+    void *argValues[] = {&outBuffer_[i], &width_, &xPos,   &yPos,
+                         &xStep,         &yStep,  &maxIter};
+    const size_t argSizes[] = {sizeof(cl_mem),   sizeof(cl_uint),
+                               sizeof(cl_float), sizeof(cl_float),
+                               sizeof(cl_float), sizeof(cl_float),
+                               sizeof(cl_uint)};
+    for (cl_uint arg = 0; arg < 7; arg++) {
+      error_ = _wrapper->clSetKernelArg(kernel_[i], arg, argSizes[arg],
+                                        argValues[arg]);
+      CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+    }
+  }
+
+  // Seed the outputs with a marker; presumably so stale data fails checkData().
+  for (i = 0; i < num_outbuffers; i++) {
+    setData(outBuffer_[i], 0xdeadbeef);
+  }
+
+  // Scale the timed run's iteration count with clock (MHz) and CU count.
+  cl_uint clkFrequency = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                     sizeof(clkFrequency), &clkFrequency, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  assert(clkFrequency > 0);
+  maxIter =
+      (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
+  maxIter = (maxIter + 15) & ~15;  // round up to a multiple of 16
+}
+
+// Runs one short warm-up launch per kernel (maxIter as set during open()),
+// then times the real launches with the final maxIter and verifies every
+// output buffer. The elapsed seconds are stored in _perfInfo.
+void OCLPerfConcurrency::run(void) {
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return;
+
+  // We handle 4 pixels per thread
+  size_t global_work_size[1] = {(size_t)(width_ >> 2)};
+  size_t local_work_size[1] = {64};
+  unsigned int i;
+
+  // Warmup
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
+        global_work_size, local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  // Switch every kernel to the full iteration count for the timed pass.
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+  }
+
+  CPerfCounter timer;
+  timer.Reset();
+  timer.Start();
+
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
+        global_work_size, local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  if (_openTest == 1) {
+    // Subtest 1 reuses the single kernel for a second launch into buffer 1.
+    error_ = _wrapper->clSetKernelArg(kernel_[0], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[1]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[0], kernel_[0], 1, NULL, global_work_size, local_work_size,
+        0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Every one of the width_ pixels is expected to run all maxIter iterations.
+  unsigned long long expected =
+      (unsigned long long)width_ * (unsigned long long)maxIter;
+
+  for (i = 0; i < num_outbuffers; i++) {
+    checkData(outBuffer_[i]);
+    CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
+  }
+
+  _perfInfo = (float)sec;
+  if (_openTest == 0)
+    testDescString = "time for 1 kernel  (s)               ";
+  else if (_openTest == 1)
+    testDescString = "time for 2 kernels (s) (same kernel) ";
+  else if (_openTest == 2)
+    testDescString = "time for 2 kernels (s) (diff kernels)";
+  else {
+    char buf[128];
+    SNPRINTF(buf, sizeof(buf), "time for %u kernels (s) (   %u queues) ",
+             num_kernels, num_cmd_queues);
+    testDescString = buf;
+  }
+}
+
+// Releases the objects open() created and returns _crcword. Null handles are
+// skipped for every object type, so a partially completed open() is tolerated.
+unsigned int OCLPerfConcurrency::close(void) {
+  unsigned int i;
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return 0;
+
+  if (cmd_queue_[0]) _wrapper->clFinish(cmd_queue_[0]);
+  for (i = 0; i < num_outbuffers; i++) {
+    if (!outBuffer_[i]) continue;
+    error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  for (i = 0; i < num_kernels; i++) {
+    if (!kernel_[i]) continue;
+    error_ = _wrapper->clReleaseKernel(kernel_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseKernel(kernel_) failed");
+  }
+  for (i = 0; i < num_programs; i++) {
+    if (!program_[i]) continue;
+    error_ = _wrapper->clReleaseProgram(program_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseProgram(program_) failed");
+  }
+  for (i = 0; i < num_cmd_queues; i++) {
+    if (!cmd_queue_[i]) continue;
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h
new file mode 100644
index 0000000000..850e146b04
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_Perf_Concurrency_H_
+#define _OCL_Perf_Concurrency_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfConcurrency : public OCLTestImp {
+ public:
+  OCLPerfConcurrency();
+  virtual ~OCLPerfConcurrency();
+
+ public:  // test entry points, implemented in OCLPerfConcurrency.cpp
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;  // NOTE(review): not used in OCLPerfConcurrency.cpp
+  void setData(cl_mem buffer, unsigned int data);  // fill buffer with `data`
+  void checkData(cl_mem buffer);  // sum buffer contents into totalIters
+
+#define MAX_ASYNC_QUEUES 8  // capacity of the per-index arrays below
+
+  cl_context context_;
+  cl_command_queue cmd_queue_[MAX_ASYNC_QUEUES];
+  cl_program program_[MAX_ASYNC_QUEUES];
+  cl_kernel kernel_[MAX_ASYNC_QUEUES];
+  cl_mem outBuffer_[MAX_ASYNC_QUEUES];
+  cl_int error_;  // last recorded OpenCL status code
+
+  unsigned int num_cmd_queues;  // entries of cmd_queue_ in use
+  unsigned int num_programs;    // entries of program_ in use
+  unsigned int num_kernels;     // entries of kernel_ in use
+  unsigned int num_outbuffers;  // entries of outBuffer_ in use
+
+  unsigned int width_;    // number of output elements (pixels)
+  unsigned int bufSize_;  // width_ * sizeof(cl_uint), in bytes
+  unsigned int maxIter;   // kernel iteration cap (256 for the warm-up)
+  unsigned int coordIdx;  // index into coords[]
+  unsigned long long totalIters;  // sum produced by the last checkData()
+  size_t numCUs;  // CL_DEVICE_MAX_COMPUTE_UNITS of the selected device
+};
+
+#endif  // _OCL_Perf_Concurrency_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp
new file mode 100644
index 0000000000..deb61efa8b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp
@@ -0,0 +1,243 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDevMemReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// SNPRINTF maps to sprintf_s when WIN_OS is defined and to snprintf otherwise
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024};  // bytes; open() uses Sizes[0]
+
+const static char *strKernel =  // OpenCL C source of read_kernel, built in open()
+    "__kernel void read_kernel(__global uint16 *src, ulong size1, uint "
+    "threads, __global uint* dst\n"
+    "                          )\n"
+    "{\n"
+    "    uint16 pval;\n"
+    "    int idx = get_global_id(0);\n"
+    "    __global uint16 *srcEnd = src + size1;\n"  // size1 counts uint16 elements
+    "     uint tmp = 0;\n"
+    "    src = &src[idx];"
+    "    while (src < srcEnd) \n"  // each work-item advances by `threads` elements
+    "        {\n"
+    "            pval = *src;\n"
+    "            src += threads;\n"
+    "            tmp += pval.s0 + pval.s1 + pval.s2 + pval.s3 + pval.s4 + pval.s5 + pval.s6 + \
+  pval.s7 + pval.s8 + pval.s9 + pval.sa + pval.sb + pval.sc + pval.sd + pval.se + pval.sf;\n"
+    "        }\n"
+    "    atomic_add(dst, tmp);\n"  // adds this work-item's lane sum to *dst
+    "}\n";
+
+OCLPerfDevMemReadSpeed::OCLPerfDevMemReadSpeed() { _numSubTests = 1; }  // declares a single sub-test
+
+OCLPerfDevMemReadSpeed::~OCLPerfDevMemReadSpeed() {}  // empty: the buffers are released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // intentionally empty; not referenced in this file
+
+void OCLPerfDevMemReadSpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  // Builds read_kernel, fills an nBytes source buffer with inputData, zeroes
+  // the one-uint result buffer and binds the four kernel arguments. Own state
+  // is reset first so close() never sees garbage handles after an early return.
+  skip_ = false;
+  srcBuffer_ = dstBuffer_ = 0;
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = 0;
+  kernel_ = 0;
+  nBytes = Sizes[0];
+  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));
+  cl_uint maxCUs;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(cl_uint), &maxCUs, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  wgs = 64;
+  const static cl_uint wavesPerCU = 8;
+  nWorkItems = maxCUs * wavesPerCU * wgs;
+  inputData = 0x1;
+  nIter = 1000;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "read_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, nBytes,
+                                        NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(srcBuffer) failed");
+  void *mem = _wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], srcBuffer_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      nBytes, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); ++i) {
+    reinterpret_cast<cl_uint *>(mem)[i] = inputData;
+  }
+
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                        sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], srcBuffer_, mem, 0,
+                                    NULL, NULL);
+  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_TRUE,
+                                     CL_MAP_READ | CL_MAP_WRITE, 0,
+                                     sizeof(cl_uint), 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  memset(mem, 0, sizeof(cl_uint));
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], dstBuffer_, mem, 0,
+                                    NULL, NULL);
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&nWorkItems);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&dstBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+void OCLPerfDevMemReadSpeed::run(void) {
+  // One validated warm-up launch, then nIter timed launches of the kernel.
+  // _perfInfo is bytes per nanosecond of summed event time (i.e. GB/s); the
+  // wall-clock GB/s figure is reported through testDescString.
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+
+  size_t gws[1] = {nWorkItems};
+  size_t lws[1] = {wgs};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // The result is a single uint, so it is read into a local (no heap buffer).
+  cl_uint memResult = 0;
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_,
+                                         CL_FALSE, 0, sizeof(cl_uint),
+                                         &memResult, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // The macro records a failure only when its condition is true, so pass true
+  // here; a constant 0 would let a bad warm-up result go unreported.
+  if (memResult != (nBytes / sizeof(cl_uint))) {
+    CHECK_RESULT_NO_RETURN(true, "Data validation failed for warm up run!\n");
+    return;
+  }
+
+  timer.Reset();
+  timer.Start();
+  double sec2 = 0;
+  cl_event *events = new cl_event[nIter];
+  unsigned int n = 0;  // launches enqueued so far; events[0, n) are valid
+  for (; n < nIter; n++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmdQueues_[_deviceId], kernel_, 1, NULL, gws, lws, 0, NULL, &events[n]);
+    if (error_ != CL_SUCCESS) break;
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+  // Errors are checked only after the events are released and freed (no leak).
+  cl_int profErr = CL_SUCCESS;
+  for (unsigned int i = 0; i < n; i++) {
+    cl_ulong startTime = 0, endTime = 0;
+    cl_int startErr = _wrapper->clGetEventProfilingInfo(
+        events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
+    cl_int endErr = _wrapper->clGetEventProfilingInfo(
+        events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+    if (profErr == CL_SUCCESS) profErr = startErr;
+    if (profErr == CL_SUCCESS) profErr = endErr;
+    _wrapper->clReleaseEvent(events[i]);
+    sec2 += endTime - startTime;
+  }
+  double sec = timer.GetElapsedTime();
+  delete[] events;
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  error_ = profErr;
+  CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
+  double perf2 = ((double)nBytes * nIter) / sec2;
+  _perfInfo = (float)perf2;
+  float perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i:%4u Wall time Perf: %.2f (GB/s)",
+           nBytes, nIter, perfInfo);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfDevMemReadSpeed::close(void) {
+  // Releases the buffers created by open(), then runs the base-class close().
+  if (!skip_) {
+    if (srcBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(srcBuffer_) failed");
+    }
+    if (dstBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(dstBuffer_) failed");
+    }
+  }
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h
new file mode 100644
index 0000000000..631b185229
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DevMemReadSpeed_H_
+#define _OCL_DevMemReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDevMemReadSpeed : public OCLTestImp {  // times a kernel that reads srcBuffer_
+ public:
+  OCLPerfDevMemReadSpeed();
+  virtual ~OCLPerfDevMemReadSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_mem srcBuffer_;        // nBytes of input read by the kernel
+  cl_mem dstBuffer_;        // one cl_uint the kernel accumulates into
+  unsigned int nWorkItems;  // global work size (work-items per launch)
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // size of srcBuffer_ in bytes
+  unsigned int nIter;       // number of timed kernel launches in run()
+  cl_uint inputData;        // value stored in every cl_uint of srcBuffer_
+  bool skip_;               // when set, run() returns at once and close() releases nothing
+};
+
+#endif  // _OCL_DevMemReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp
new file mode 100644
index 0000000000..83992db7a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDevMemWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// SNPRINTF maps to sprintf_s when WIN_OS is defined and to snprintf otherwise
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024};  // bytes; open() uses Sizes[0]
+
+const static char *strKernel =  // OpenCL C source of write_kernel, built in open()
+
+    "__kernel void write_kernel(__global uint16 *dst, ulong size1, uint "
+    "threads\n"
+    "                          )\n"
+    "{\n"
+    "    uint16 pval = (uint16)(0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab,\
+ 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab);\n"
+    "    int idx = get_global_id(0);\n"
+    "    __global uint16 *dstEnd = dst + size1;\n"  // size1 counts uint16 elements
+    "    dst = &dst[idx];"
+    "    do\n"  // do-while: every work-item stores at least one element
+    "        {\n"
+    "            *dst = pval;\n"
+    "            dst += threads;\n"  // each work-item advances by `threads` elements
+    "        }\n"
+    "    while (dst < dstEnd);\n"
+    "}\n";
+
+OCLPerfDevMemWriteSpeed::OCLPerfDevMemWriteSpeed() { _numSubTests = 1; }  // declares a single sub-test
+
+OCLPerfDevMemWriteSpeed::~OCLPerfDevMemWriteSpeed() {}  // empty: the buffer is released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // intentionally empty; not referenced in this file
+
+void OCLPerfDevMemWriteSpeed::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  // Builds write_kernel, creates the nBytes output buffer and binds the args.
+  // Own state is reset first so close() is safe after any early return.
+  skip_ = false;
+  dstBuffer_ = 0;
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = 0;
+  kernel_ = 0;
+  nBytes = Sizes[0];
+  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));
+  cl_uint maxCUs;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(cl_uint), &maxCUs, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  wgs = 64;
+  const static cl_uint wavesPerCU = 8;
+  nWorkItems = maxCUs * wavesPerCU * wgs;
+  inputData = 0xabababab;
+  nIter = 1000;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "write_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, nBytes,
+                                        NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &dstBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&nWorkItems);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+void OCLPerfDevMemWriteSpeed::run(void) {
+  // Warm-up launch with validation, then nIter timed launches. _perfInfo is
+  // bytes/ns of event time (GB/s); wall-clock GB/s goes into testDescString.
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+  size_t gws[1] = {nWorkItems};
+  size_t lws[1] = {wgs};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  cl_uint *memResult = (cl_uint *)calloc(1, nBytes);  // zero-filled host copy
+  if (0 == memResult) {
+    // The macro records a failure only when its condition is true.
+    CHECK_RESULT_NO_RETURN(true, "malloc failed!\n");
+    return;
+  }
+  error_ =
+      _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_FALSE,
+                                    0, nBytes, memResult, 0, NULL, NULL);
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  // Scan only after a good read; free the copy before any check can return.
+  const size_t nWords = nBytes / sizeof(cl_uint);
+  size_t firstBad = 0;
+  if (error_ == CL_SUCCESS) {
+    while (firstBad < nWords && memResult[firstBad] == inputData) firstBad++;
+  }
+  free(memResult);
+  CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
+  if (firstBad != nWords) {
+    CHECK_RESULT_NO_RETURN(true, "Data validation failed for warm up run!\n");
+    return;
+  }
+  timer.Reset();
+  timer.Start();
+  double sec2 = 0;
+  cl_event *events = new cl_event[nIter];
+  unsigned int n = 0;  // launches enqueued so far; events[0, n) are valid
+  for (; n < nIter; n++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmdQueues_[_deviceId], kernel_, 1, NULL, gws, lws, 0, NULL, &events[n]);
+    if (error_ != CL_SUCCESS) break;
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+  // Errors are checked only after the events are released and freed (no leak).
+  cl_int profErr = CL_SUCCESS;
+  for (unsigned int i = 0; i < n; i++) {
+    cl_ulong startTime = 0, endTime = 0;
+    cl_int startErr = _wrapper->clGetEventProfilingInfo(
+        events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
+    cl_int endErr = _wrapper->clGetEventProfilingInfo(
+        events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+    if (profErr == CL_SUCCESS) profErr = startErr;
+    if (profErr == CL_SUCCESS) profErr = endErr;
+    _wrapper->clReleaseEvent(events[i]);
+    sec2 += endTime - startTime;
+  }
+  double sec = timer.GetElapsedTime();
+  delete[] events;
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  error_ = profErr;
+  CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
+
+  // write speed in GB/s
+  _perfInfo = (float)(((double)nBytes * nIter) / sec2);
+  float perfInfo = (float)(((double)nBytes * nIter * (double)(1e-09)) / sec);
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i:%4u Wall time Perf: %.2f (GB/s)",
+           nBytes, nIter, perfInfo);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfDevMemWriteSpeed::close(void) {
+  // Releases the output buffer created by open(), then runs the base close().
+  if (!skip_) {
+    if (dstBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(dstBuffer_) failed");
+    }
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h
new file mode 100644
index 0000000000..7bdfdc70b1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DevMemWriteSpeed_H_
+#define _OCL_DevMemWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDevMemWriteSpeed : public OCLTestImp {  // times a kernel that fills dstBuffer_
+ public:
+  OCLPerfDevMemWriteSpeed();
+  virtual ~OCLPerfDevMemWriteSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_mem dstBuffer_;        // nBytes buffer written by the kernel
+  unsigned int nWorkItems;  // global work size (work-items per launch)
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // size of dstBuffer_ in bytes
+  unsigned int nIter;       // number of timed kernel launches in run()
+  cl_uint inputData;        // value run() expects in every cl_uint of dstBuffer_
+  bool skip_;               // when set, run() returns at once and close() releases nothing
+};
+
+#endif  // _OCL_DevMemWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.cpp
new file mode 100644
index 0000000000..3c6c97e14a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.cpp
@@ -0,0 +1,480 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceConcurrency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// SNPRINTF maps to sprintf_s when WIN_OS is defined and to snprintf otherwise
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+typedef struct {  // element type of coords[]
+  double x;  // presumably the view position used for the mandelbrot kernel -- TODO confirm
+  double y;
+  double width;  // presumably the width of the computed region -- TODO confirm
+} coordRec;
+
+static coordRec coords[] = {  // single entry in {x, y, width} order
+    {0.0, 0.0, 0.00001},  // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);  // number of entries in coords[]
+
+static const char *float_mandel_vec =  // OpenCL C source of the `mandelbrot` kernel; each work-item writes one uint4
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % (width/4);\n"
+    "    int j = tid / (width/4);\n"
+    "    int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
+    "    int4 vecj = (int4)(j, j, j, j);\n"
+    "    float4 x0;\n"
+    "    x0.s0 = (float)(xPos + xStep*veci.s0);\n"
+    "    x0.s1 = (float)(xPos + xStep*veci.s1);\n"
+    "    x0.s2 = (float)(xPos + xStep*veci.s2);\n"
+    "    x0.s3 = (float)(xPos + xStep*veci.s3);\n"
+    "    float4 y0;\n"
+    "    y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
+    "    y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
+    "    y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
+    "    y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
+    "\n"
+    "    float4 x = x0;\n"
+    "    float4 y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    float4 tmp;\n"
+    "    int4 stay;\n"
+    "    int4 ccount = 0;\n"
+    "    float4 savx = x;\n"
+    "    float4 savy = y;\n"
+    "    stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "    for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
+    "maxIter); iter+=16)\n"
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = x*x + x0 - y*y;\n"
+    "        y = 2.0f * x * y + y0;\n"
+    "        x = tmp*tmp + x0 - y*y;\n"
+    "        y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    "        stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "        savx = (stay ? x : savx);\n"
+    "        savy = (stay ? y : savy);\n"
+    "        ccount -= stay*16;\n"
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            // More efficient to use scalar ops here: Why?\n"
+    "            stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
+    "maxIter);\n"
+    "            stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
+    "maxIter);\n"
+    "            stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
+    "maxIter);\n"
+    "            stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
+    "maxIter);\n"
+    "            tmp = x;\n"
+    "            x = x*x + x0 - y*y;\n"
+    "            y = 2.0f*tmp*y + y0;\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
+    "            savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
+    "            savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
+    "            savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
+    "            savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
+    "            savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
+    "            savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
+    "            savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
+    "        } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
+    "    }\n"
+    "    __global uint4 *vecOut = (__global uint4 *)out;\n"
+    "    vecOut[tid] = convert_uint4(ccount);\n"
+    "}\n";
+
+OCLPerfDeviceConcurrency::OCLPerfDeviceConcurrency() {
+  // One sub-test per device of type_ on platform _platformIndex, capped at
+  // MAX_DEVICES; zero when that platform is missing or lists no such device.
+  // A clGetDeviceIDs failure is not reported here; error_ keeps its code.
+  cl_uint numPlatforms = 0;
+  num_devices = 0;  // read at the end even if no platform is queried
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if ((cl_uint)_platformIndex < numPlatforms) {  // keeps platforms[] in range
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    const bool listed = (error_ == CL_SUCCESS);
+    if (listed) {
+      /* Get the number of requested devices */
+      error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0,
+                                        NULL, &num_devices);
+      if (num_devices > MAX_DEVICES) num_devices = MAX_DEVICES;
+    }
+    delete[] platforms;  // new[] pairs with delete[]; freed before any return
+    CHECK_RESULT(!listed, "clGetPlatformIDs failed");
+  }
+  _numSubTests = num_devices;
+}
+
+OCLPerfDeviceConcurrency::~OCLPerfDeviceConcurrency() {}  // empty destructor
+
+void OCLPerfDeviceConcurrency::setData(cl_mem buffer, unsigned int idx,
+                                       unsigned int val) {  // writes val into the first width_ uints
+  cl_command_queue q = cmd_queue_[idx];
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      q, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, &error_);
+  if (data == NULL) return;  // map failed; error_ holds the reason
+  for (unsigned int i = 0; i < width_; i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(q, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(q);
+}
+
+void OCLPerfDeviceConcurrency::checkData(cl_mem buffer, unsigned int idx) {
+  // Sums the first width_ uints of buffer into totalIters through queue idx.
+  // totalIters is left at 0 when the buffer cannot be mapped.
+  cl_command_queue q = cmd_queue_[idx];
+  totalIters = 0;
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      q, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, &error_);
+  if (data == NULL) return;  // map failed; error_ holds the reason
+  for (unsigned int i = 0; i < width_; i++) totalIters += data[i];
+  error_ = _wrapper->clEnqueueUnmapMemObject(q, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(q);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // intentionally empty; NOTE(review): no use visible in the reviewed part of this file
+
+void OCLPerfDeviceConcurrency::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  // Creates one context over the platform's devices of type_, then a queue,
+  // output buffer, program and kernel for each of the first (test + 1)
+  // devices. conversion is set to 1; units is not written.
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  // num_devices is capped at MAX_DEVICES, so a fixed array holds every id and
+  // no heap block can leak on the early returns below.
+  cl_device_id devices[MAX_DEVICES];
+  unsigned int i;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  // Clear every handle and count up front so a failed open() leaves nothing
+  // stale behind for close().
+  context_ = 0;
+  num_devices = 0;
+  cur_devices = 0;
+  for (i = 0; i < MAX_DEVICES; i++) {
+    cmd_queue_[i] = 0;
+    program_[i] = 0;
+    kernel_[i] = 0;
+    outBuffer_[i] = 0;
+  }
+
+  // Warm-up iteration count; the end of open() replaces it with a
+  // clock-scaled value for the timed run. Some kernels are unrolled 16 times,
+  // so maxIter must stay divisible by 16. Larger values give better peak
+  // numbers but can TDR slow ASICs.
+  maxIter = 256;
+
+  // Width must be divisible by 4 because the float_mandel_vec kernel
+  // processes 4 pixels at once.
+  width_ = 256;
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only index the list when the query worked and the index is in range.
+    if (error_ == CL_SUCCESS && (cl_uint)_platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // array form; done before any early return
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  if (num_devices > MAX_DEVICES) {
+    num_devices = MAX_DEVICES;
+  }
+  CHECK_RESULT(num_devices == 0, "no devices");
+
+  /* Get the requested devices */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, num_devices, devices,
+                                       notify_callback, NULL, &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  // Sub-test N drives devices 0..N; refuse an index past what was enumerated
+  // so devices[] is never read beyond its filled part.
+  CHECK_RESULT(test >= num_devices, "not enough devices");
+  cur_devices = _openTest + 1;
+
+  for (i = 0; i < cur_devices; i++) {
+    cmd_queue_[i] =
+        _wrapper->clCreateCommandQueue(context_, devices[i], 0, NULL);
+    CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+  }
+
+  const char *source = float_mandel_vec;
+  for (i = 0; i < cur_devices; i++) {
+    program_[i] = _wrapper->clCreateProgramWithSource(context_, 1, &source,
+                                                      NULL, &error_);
+    CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
+
+    error_ =
+        _wrapper->clBuildProgram(program_[i], 1, &devices[i], "", NULL, NULL);
+
+    if (error_ != CL_SUCCESS) {
+      char log[16384] = "";  // stays empty if the log query itself fails
+      _wrapper->clGetProgramBuildInfo(program_[i], devices[i],
+                                      CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                      NULL);
+      printf("Build error on device %u -> %s\n", i, log);
+
+      // The build already failed, so report it unconditionally.
+      CHECK_RESULT(true, "clBuildProgram failed");
+    }
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
+    CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
+  }
+
+  // Region for this sub-test: centred on coords[coordIdx], width_ steps wide.
+  coordIdx = _openTest % numCoords;
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  for (i = 0; i < cur_devices; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[i]);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint),
+                                      (void *)&width_);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float),
+                                      (void *)&xPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float),
+                                      (void *)&yPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float),
+                                      (void *)&xStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float),
+                                      (void *)&yStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    setData(outBuffer_[i], i, 0xdeadbeef);
+  }
+
+  // Timed-run iteration count: scaled by device 0's clock, rounded up to 16.
+  cl_uint clkFrequency = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                     sizeof(clkFrequency), &clkFrequency, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  assert(clkFrequency > 0);
+  maxIter = (unsigned int)(8388608 * ((float)clkFrequency / 1000));
+  maxIter = (maxIter + 15) & ~15;
+}
+
+void OCLPerfDeviceConcurrency::run(void) {  // warm-up pass, then timed pass
+  int global = width_ >> 2;
+  // We handle 4 pixels per thread
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+  unsigned int i;
+
+  // Warmup: runs with the maxIter value bound as arg 6 during open()
+  for (i = 0; i < cur_devices; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i], kernel_[i], 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  // Flush every queue first, then wait on each one.
+  for (i = 0; i < cur_devices; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+  // Re-bind arg 6: open() changed maxIter after its own clSetKernelArg call.
+  for (i = 0; i < cur_devices; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  // Timed region: enqueue on all queues, flush all, then finish all.
+  for (i = 0; i < cur_devices; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i], kernel_[i], 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+
+  for (i = 0; i < cur_devices; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  // Every device's buffer must sum to width_ * maxIter (see checkData()).
+  unsigned long long expected =
+      (unsigned long long)width_ * (unsigned long long)maxIter;
+
+  for (i = 0; i < cur_devices; i++) {
+    checkData(outBuffer_[i], i);
+    CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
+  }
+  // The reported result is the elapsed time of the timed region, in seconds.
+  _perfInfo = (float)sec;
+  char buf[128];
+  SNPRINTF(buf, sizeof(buf), "time for %2d devices (s) (%2d queues) ",
+           cur_devices, cur_devices);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfDeviceConcurrency::close(void) {
+  // Releases what open() created and returns _crcword. open() zeroes every
+  // slot before creating anything, so empty slots are skipped here instead of
+  // trusting cur_devices, which open() assigns only after several checks.
+  for (unsigned int i = 0; i < MAX_DEVICES; i++) {
+    if (outBuffer_[i] == 0) continue;
+    error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  for (unsigned int i = 0; i < MAX_DEVICES; i++) {
+    if (kernel_[i] == 0) continue;
+    error_ = _wrapper->clReleaseKernel(kernel_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseKernel(kernel_) failed");
+  }
+  for (unsigned int i = 0; i < MAX_DEVICES; i++) {
+    if (program_[i] == 0) continue;
+    error_ = _wrapper->clReleaseProgram(program_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseProgram(program_) failed");
+  }
+  for (unsigned int i = 0; i < MAX_DEVICES; i++) {
+    if (cmd_queue_[i] == 0) continue;
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h
new file mode 100644
index 0000000000..eed83632a0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_Perf_DeviceConcurrency_H_
+#define _OCL_Perf_DeviceConcurrency_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDeviceConcurrency : public OCLTestImp {  // multi-device timing
+ public:
+  OCLPerfDeviceConcurrency();
+  virtual ~OCLPerfDeviceConcurrency();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // warm-up pass plus one timed pass
+  virtual unsigned int close(void);  // releases OpenCL objects
+
+  std::string shader_;
+  void setData(cl_mem buffer, unsigned int idx, unsigned int data);  // fill
+  void checkData(cl_mem buffer, unsigned int idx);  // sums into totalIters
+  // Cap on devices; sizes the arrays below. A macro, so not class-scoped.
+#define MAX_DEVICES 16
+
+  cl_context context_;  // one context shared by all devices
+  cl_command_queue cmd_queue_[MAX_DEVICES];  // per-device objects, indexed
+  cl_program program_[MAX_DEVICES];          // by device position
+  cl_kernel kernel_[MAX_DEVICES];
+  cl_mem outBuffer_[MAX_DEVICES];
+  cl_int error_;  // status of the most recent OpenCL call
+
+  cl_uint num_devices;  // devices found, capped at MAX_DEVICES
+  cl_uint cur_devices;  // devices used by the current sub-test (test + 1)
+
+  unsigned int width_;    // elements per output buffer
+  unsigned int bufSize_;  // bytes per output buffer
+  unsigned int maxIter;   // passed to the kernel as argument 6
+  unsigned int coordIdx;  // index into coords for this sub-test
+  unsigned long long totalIters;  // sum computed by checkData()
+};
+
+#endif  // _OCL_Perf_DeviceConcurrency_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.cpp
new file mode 100644
index 0000000000..3efecf5ff1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.cpp
@@ -0,0 +1,227 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueue.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;  // starting work-item count for the parent launch
+} testStruct;
+// Thread-count table; open() picks testList[test % testListSize].
+static testStruct testList[] = {
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+// OpenCL C source: each parentKernel work-item enqueues a 64-item childKernel.
+const static char* strKernel = {KERNEL_CODE(
+    \n __kernel void childKernel(__global uint* buf) {
+  int idx = get_global_id(0);
+  if (idx < 0) {
+    buf[idx] = 0;
+  }
+}
+    \n __kernel void parentKernel(__global uint* buf) {
+  queue_t def_q = get_default_queue();
+  ndrange_t ndrange = ndrange_1D(64, 64);
+  int gid = get_global_id(0);
+
+  int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+    childKernel(buf);
+  });
+}
+    \n)};
+
+OCLPerfDeviceEnqueue::OCLPerfDeviceEnqueue() {
+  failed_ = false;  // open() sets this when the device reports OpenCL < 2.0
+  kernel2_ = NULL;
+  deviceQueue_ = NULL;
+  testListSize = sizeof(testList) / sizeof(testList[0]);
+  _numSubTests = testListSize * 7;  // seven passes over the thread table
+}
+
+OCLPerfDeviceEnqueue::~OCLPerfDeviceEnqueue() {}  // empty body
+
+void OCLPerfDeviceEnqueue::open(unsigned int test, char* units,
+                                double& conversion, unsigned int deviceId) {
+  // Builds parentKernel/childKernel, a 2048-byte buffer and an on-device
+  // default queue sized for sub-test `test`. Sets failed_ on pre-2.0 devices.
+  if (type_ == CL_DEVICE_TYPE_CPU) return;  // CPU devices are skipped
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  threads = testList[testID_ % testListSize].threads;
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // "OpenCL <major>.<minor> ...": index 7 is the major version digit.
+  const bool preCL2 =
+      (error_ == CL_SUCCESS) && (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;  // array form; freed before either return below
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    failed_ = true;
+    return;
+  }
+  cl_uint maxDevQSize = 0;
+#if defined(CL_VERSION_2_0)
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE,
+                                     sizeof(cl_uint), &maxDevQSize, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+#endif
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = "";  // stays empty if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                           2048, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  // Hardcoded for us
+  if (testID_ >= testListSize) {
+    queueSize = (1 << (testID_ / testListSize)) * 256 * 1024;
+    queueSize = std::min(queueSize, maxDevQSize);
+    threads *= (1 << (testID_ / testListSize - 1));
+    threads = std::min(threads, queueSize / 128);
+  } else {
+    queueSize = std::max((cl_uint)threads * 128, (cl_uint)16384);
+  }
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced in this file
+
+void OCLPerfDeviceEnqueue::run(void) {  // times repeated parentKernel launches
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  if (failed_) return;  // open() found a device older than OpenCL 2.0
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {threads};
+  size_t lws[1] = {64};
+  // Use 256-wide work-groups once the launch has at least 256 work-items.
+  if (gws[0] >= 256) {
+    lws[0] = 256;
+  }
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  // One launch outside the timed region.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  // NOTE(review): 64 / threads is 0 for threads > 64, so repeats becomes 1.
+  // Try to normalize the amount of work per test
+  unsigned int repeats = (64 / threads) * 50;
+  if (repeats == 0) repeats = 1;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+  // Result: (threads * repeats) per second, scaled by 1e6.
+  _perfInfo = (float)(threads * repeats) / (float)(sec * 1000000.);
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf),
+           "%7d threads spawning 64 threads, queue size %5dKB (Mdisp/s)",
+           threads, queueSize / 1024);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfDeviceEnqueue::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) return 0;
+  // Release our handles, nulling each so a repeat close() cannot free it twice.
+  if (NULL != deviceQueue_) {
+    _wrapper->clReleaseCommandQueue(deviceQueue_);
+    deviceQueue_ = NULL;
+  }
+  if (NULL != kernel2_) {
+    _wrapper->clReleaseKernel(kernel2_);
+    kernel2_ = NULL;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h
new file mode 100644
index 0000000000..c1a033fb48
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_H_
+#define _OCLPERF_DEVICE_ENQUEUE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDeviceEnqueue : public OCLTestImp {  // device-side enqueue timing
+ public:
+  OCLPerfDeviceEnqueue();
+  virtual ~OCLPerfDeviceEnqueue();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device default queue, made in open()
+  bool failed_;                   // set when the device reports OpenCL < 2.0
+  unsigned int testID_;           // sub-test index passed to open()
+  cl_kernel kernel2_;             // "childKernel"
+  unsigned int testListSize;      // number of entries in testList
+  unsigned int threads;           // global work size used by run()
+  cl_uint queueSize;              // CL_QUEUE_SIZE value for deviceQueue_
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.cpp
new file mode 100644
index 0000000000..67835dc173
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.cpp
@@ -0,0 +1,260 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueue2.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;  // global work size of the parent launch
+} testStruct;
+// Thread counts; open() indexes this by test / (queue sizes * levels).
+static testStruct testList[] = {
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+// Device queue sizes; open() multiplies the chosen entry by 1024.
+static unsigned int qsizeList[] = {
+    16, 32, 64, 128, 256, 512,
+};
+// Values for the kernels' `level` argument; open() uses test % level count.
+static unsigned int levelList[] = {
+    1,
+    2,
+    4,
+    8,
+};
+// OpenCL C source: kernels re-enqueue childKernel with level - 1 until 0.
+const static char* strKernel = {KERNEL_CODE(
+  \n __kernel void childKernel(__global uint* buf, uint level) {
+  if (level) {
+    queue_t def_q = get_default_queue();
+    ndrange_t ndrange = ndrange_1D(64, 64);
+    int gid = get_global_id(0);
+    int lid = get_local_id(0);
+    if (lid == 0) {
+      int enq_res =
+          enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+            childKernel(buf, level - 1);
+          });
+    }
+  } else {
+    int idx = get_global_id(0);
+    if (idx < 0) {
+      buf[idx] = 0;
+    }
+  }
+}
+  \n __kernel void parentKernel(__global uint* buf, uint level) {
+  queue_t def_q = get_default_queue();
+  ndrange_t ndrange = ndrange_1D(64, 64);
+  int gid = get_global_id(0);
+
+  if (level) {
+    int enq_res =
+        enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+          childKernel(buf, level - 1);
+        });
+  }
+}
+  \n)};
+
+OCLPerfDeviceEnqueue2::OCLPerfDeviceEnqueue2() {
+  skip_ = false;  // no handles, no failure and nothing skipped until open()
+  failed_ = false;
+  kernel2_ = NULL;
+  deviceQueue_ = NULL;
+  level = 2;
+  subTests_thread = sizeof(testList) / sizeof(testList[0]);
+  subTests_qsize = sizeof(qsizeList) / sizeof(qsizeList[0]);
+  subTests_level = sizeof(levelList) / sizeof(levelList[0]);
+  testListSize = subTests_thread;
+  _numSubTests = subTests_thread * subTests_qsize * subTests_level;  // grid
+}
+
+OCLPerfDeviceEnqueue2::~OCLPerfDeviceEnqueue2() {}  // empty body
+
+void OCLPerfDeviceEnqueue2::open(unsigned int test, char* units,
+                                 double& conversion, unsigned int deviceId) {
+  // Decodes `test` into thread count, queue size and level, then builds the
+  // kernels, a 2048-byte buffer and the on-device default queue.
+  if (type_ == CL_DEVICE_TYPE_CPU) return;  // CPU devices are skipped
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
+  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
+  level = levelList[testID_ % subTests_level];
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // "OpenCL <major>.<minor> ...": index 7 is the major version digit.
+  const bool preCL2 =
+      (error_ == CL_SUCCESS) && (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;  // array form; freed before either return below
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = "";  // stays empty if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                           2048, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced in this file
+
+void OCLPerfDeviceEnqueue2::run(void) {  // times repeated parentKernel launches
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  if (failed_) {  // open() found a device older than OpenCL 2.0
+    return;
+  }
+
+  if (skip_) {  // built without CL_VERSION_2_0
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {threads};
+  size_t lws[1] = {64};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  // One launch outside the timed region.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Try to normalize the amount of work per test
+  // unsigned int repeats = (4096 / threads) * 100 ;
+  unsigned int repeats = (4096 / threads) * 10;
+  // unsigned int repeats = 100;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+  // Result: (threads * repeats * level) per second, scaled by 1e6.
+  _perfInfo = (float)(threads * repeats * level) / (float)(sec * 1000000.);
+  char buf[256];
+  SNPRINTF(
+      buf, sizeof(buf),
+      "%5d threads spawning 64 threads, queue size %3dKB (Mdisp/s), level=%2d",
+      threads, queueSize / 1024, level);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfDeviceEnqueue2::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (CL_DEVICE_TYPE_CPU == type_) return 0;
+
+  // Release the device queue and the child kernel when they exist, then hand
+  // over to OCLTestImp::close() for the return value.
+  if (NULL != deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (NULL != kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h
new file mode 100644
index 0000000000..2a4bde8ced
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE2_H_
+#define _OCLPERF_DEVICE_ENQUEUE2_H_
+
+#include "OCLTestImp.h"
+
+// Device-enqueue performance test built on OCLTestImp. It overrides the
+// open/run/close hooks declared below.
+class OCLPerfDeviceEnqueue2 : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueue2();
+  virtual ~OCLPerfDeviceEnqueue2();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // released in close() when non-NULL
+  unsigned int testID_;
+  cl_kernel kernel2_;  // released in close() when non-NULL
+  unsigned int testListSize;
+  unsigned int threads;  // reported as the thread count by run()
+  cl_uint queueSize;     // reported by run() in KB (queueSize / 1024)
+  unsigned int subTests_level;
+  unsigned int subTests_qsize;
+  unsigned int subTests_thread;
+  unsigned int level;  // scales the dispatch count reported by run()
+  unsigned int lws_value;
+
+  bool failed_;
+  bool skip_;
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE2_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp
new file mode 100644
index 0000000000..6fa7dcab50
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp
@@ -0,0 +1,267 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueueEvent.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+// One entry of the thread-count table: the global work size used for the
+// parent kernel launch.
+typedef struct {
+  unsigned int threads;
+} testStruct;
+
+// Global work sizes swept by the test.
+static testStruct testList[] = {
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+
+// Device queue sizes in KB; open() multiplies the selected value by 1024.
+static unsigned int qsizeList[] = {
+    16, 32, 64, 128, 256, 512,
+};
+
+// Values passed to the parent kernel as its `level` argument.
+static unsigned int levelList[] = {
+    1,
+    2,
+    4,
+    8,
+};
+
+// OpenCL C program source, turned into a string literal by KERNEL_CODE.
+//
+// childKernel: writes to buf only when its global id is negative.
+// parentKernel: when `level` is non-zero, every work-item enqueues `level`
+// instances of childKernel (ndrange_1D(64, 64)) on the default device queue
+// with CLK_ENQUEUE_FLAGS_NO_WAIT, then marks its user event complete and
+// releases it. The user event is created but not passed as a wait event (the
+// argument is commented out and NULL is passed instead). When `level` is
+// zero, parentKernel has the same body as childKernel.
+const static char* strKernel = {KERNEL_CODE(
+  \n __kernel void childKernel(__global uint* buf, uint level,
+                                clk_event_t wait_evt) {
+  int idx = get_global_id(0);
+  if (idx < 0) {
+    buf[idx] = 0;
+  }
+}
+  \n __kernel void parentKernel(__global uint* buf, uint level) {
+  if (level) {
+    queue_t def_q = get_default_queue();
+    ndrange_t ndrange = ndrange_1D(64, 64);
+    clk_event_t user_evt = create_user_event();
+    clk_event_t block_evt, wait_evt;
+    wait_evt = user_evt;
+
+    for (uint i = 0; i < level; i++) {
+      int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0,
+                                   /*&user_evt*/ NULL, &block_evt, ^{
+                                     childKernel(buf, level - 1, block_evt);
+                                   });
+
+      // wait_evt = block_evt;
+    }
+    if (is_valid_event(user_evt)) {
+      set_user_event_status(user_evt, CL_COMPLETE);
+      release_event(user_evt);
+    }
+  } else {
+    int idx = get_global_id(0);
+    if (idx < 0) {
+      buf[idx] = 0;
+    }
+  }
+}
+  \n)};
+
+// Sizes the sub-test space from the parameter tables and clears the handles
+// and status flags that open(), run() and close() consult.
+OCLPerfDeviceEnqueueEvent::OCLPerfDeviceEnqueueEvent() {
+  // No OpenCL objects exist until open() creates them.
+  deviceQueue_ = NULL;
+  kernel2_ = NULL;
+  failed_ = false;
+  skip_ = false;
+  level = 2;
+
+  // Element counts of the three parameter tables.
+  subTests_thread = sizeof(testList) / sizeof(testList[0]);
+  subTests_qsize = sizeof(qsizeList) / sizeof(qsizeList[0]);
+  subTests_level = sizeof(levelList) / sizeof(levelList[0]);
+  testListSize = subTests_thread;
+
+  // One sub-test per (thread count, queue size, level) combination.
+  _numSubTests = subTests_thread * subTests_qsize * subTests_level;
+}
+
+// Empty: the OpenCL objects owned by this test are released in close().
+OCLPerfDeviceEnqueueEvent::~OCLPerfDeviceEnqueueEvent() {}
+
+// Prepares one sub-test.
+//
+// Decodes `test` into a thread count, a device queue size and a nesting
+// level, checks that the device reports OpenCL 2.x or newer, builds the
+// program with -cl-std=CL2.0, creates both kernels and a 2048-byte buffer,
+// and creates the default on-device queue.
+//
+// Parameters are forwarded unchanged to OCLTestImp::open(); `test` selects
+// the sub-test and `deviceId` the device.
+//
+// Does nothing for CPU devices. Sets failed_ when the device version is
+// below 2.0 and skip_ when the headers were built without CL_VERSION_2_0;
+// run() returns early in both cases.
+void OCLPerfDeviceEnqueueEvent::open(unsigned int test, char* units,
+                                     double& conversion,
+                                     unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  // test = (threadIdx * qsizeCount + qsizeIdx) * levelCount + levelIdx
+  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
+  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
+  level = levelList[testID_ % subTests_level];
+
+  lws_value = 64;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // The version string has the form "OpenCL <major>.<minor> ...", so the
+  // major digit sits at index 7. Copy it out and free the buffer right away
+  // so that none of the early returns below can leak it.
+  char majorVersion = '\0';
+  if ((error_ == CL_SUCCESS) && (param_size > 7)) {
+    majorVersion = strVersion[7];
+  }
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (majorVersion < '2') {
+    // Device enqueue needs OpenCL 2.0; run() becomes a no-op.
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Print the build log before reporting the failure below.
+    char programLog[1024];
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  // Default on-device queue; the kernel obtains it via get_default_queue().
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+// Notification callback with an empty body: every argument is ignored.
+// NOTE(review): nothing in this file references it.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Times repeated launches of the parent kernel and reports the rate.
+//
+// After one untimed launch, the kernel is enqueued and finished `repeats`
+// times under the timer. _perfInfo is set to
+// threads * repeats * level / (seconds * 1e6) and testDescString to a line
+// describing the configuration. Returns without doing anything on CPU
+// devices or when open() set failed_ or skip_.
+void OCLPerfDeviceEnqueueEvent::run(void) {
+  CPerfCounter timer;
+  if ((type_ == CL_DEVICE_TYPE_CPU) || failed_ || skip_) {
+    return;
+  }
+
+  cl_mem outBuf = buffers()[0];
+  size_t globalSize[1] = {threads};
+  size_t localSize[1] = {lws_value};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &outBuf);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Untimed launch before the measurement starts.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, globalSize, localSize, 0,
+                                            NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Try to normalize the amount of work per test
+  const unsigned int repeats = (4096 / threads) * 10;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int remaining = repeats; remaining > 0; --remaining) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, globalSize, localSize, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  const double elapsedSec = timer.GetElapsedTime();
+  const unsigned int dispatches = threads * repeats * level;
+  _perfInfo = (float)dispatches / (float)(elapsedSec * 1000000.);
+
+  char desc[256];
+  SNPRINTF(
+      desc, sizeof(desc),
+      "%5d threads spawning %2d threads, queue size %3dKB (Mdisp/s), level=%2d",
+      threads, lws_value, queueSize / 1024, level);
+  testDescString = desc;
+}
+
+// Releases the objects owned by this test on top of OCLTestImp.
+//
+// The on-device queue and the child kernel are released when set, and their
+// handles are cleared so that a later close() (for example after an open()
+// that returned early) cannot release them a second time. Release failures
+// are reported without aborting the cleanup.
+//
+// Returns 0 for CPU devices, where open() created nothing; otherwise the
+// value returned by OCLTestImp::close().
+unsigned int OCLPerfDeviceEnqueueEvent::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    // Forget the handle even on failure: it must never be released again.
+    deviceQueue_ = NULL;
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    kernel2_ = NULL;
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h
new file mode 100644
index 0000000000..f7c37c3f51
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
+#define _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
+
+#include "OCLTestImp.h"
+
+// Device-enqueue performance test built on OCLTestImp. Each sub-test is one
+// combination of thread count, device queue size and nesting level.
+class OCLPerfDeviceEnqueueEvent : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueueEvent();
+  virtual ~OCLPerfDeviceEnqueueEvent();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device queue; released in close()
+  unsigned int testID_;           // sub-test index passed to open()
+  cl_kernel kernel2_;             // "childKernel"; released in close()
+  unsigned int testListSize;      // same value as subTests_thread
+  unsigned int threads;           // global work size of the parent launch
+  cl_uint queueSize;              // device queue size in bytes
+  unsigned int subTests_level;    // number of nesting-level values
+  unsigned int subTests_qsize;    // number of queue-size values
+  unsigned int subTests_thread;   // number of thread-count values
+  unsigned int level;             // passed to parentKernel as `level`
+  unsigned int lws_value;         // local work size; set to 64 in open()
+
+  bool failed_;  // set by open() when the device version is below 2.0
+  bool skip_;    // set by open() when built without CL_VERSION_2_0
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp
new file mode 100644
index 0000000000..da048933f8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp
@@ -0,0 +1,233 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueueSier.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+// NOTE(review): this type is not used anywhere else in this file.
+typedef struct {
+  unsigned int threads;
+} testStruct;
+
+// Side lengths of the square 2-D launch, one per sub-test; each is a power
+// of three (3^4 .. 3^10).
+static unsigned int sizeList[] = {
+    81, 243, 729, 2187, 6561, 19683, 59049,
+};
+
+// OpenCL C program source, turned into a string literal by KERNEL_CODE.
+//
+// parentKernel splits its 2-D range into thirds. Work-items in the centre
+// third write to buf only if their global id is negative. Of the remaining
+// work-items, those whose x and y are both multiples of one_third enqueue
+// parentKernel again on the default device queue over a
+// one_third x one_third grid, as long as one_third is greater than 1.
+const static char* strKernel = {KERNEL_CODE(
+    \n __kernel void parentKernel(__global uint* buf, int width, int offsetx,
+                                   int offsety) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  queue_t q = get_default_queue();
+
+  int one_third = get_global_size(0) / 3;
+  int two_thirds = 2 * one_third;
+
+  if (x >= one_third && x < two_thirds && y >= one_third && y < two_thirds) {
+    int idx = get_global_id(0);
+    if (idx < 0) {
+      buf[idx] = 0;
+    }
+  } else {
+    if (one_third > 1 && x % one_third == 0 && y % one_third == 0) {
+      const size_t grid[2] = {one_third, one_third};
+      enqueue_kernel(q, 0, ndrange_2D(grid), ^{
+        parentKernel(buf, width, x + offsetx, y + offsety);
+      });
+    }
+  }
+}
+    \n)};
+
+// Clears the status flags and the queue handle, and registers one sub-test
+// per entry of sizeList.
+OCLPerfDeviceEnqueueSier::OCLPerfDeviceEnqueueSier() {
+  skip_ = false;
+  failed_ = false;
+  deviceQueue_ = NULL;
+  _numSubTests = sizeof(sizeList) / sizeof(sizeList[0]);
+}
+
+// Empty: the device queue owned by this test is released in close().
+OCLPerfDeviceEnqueueSier::~OCLPerfDeviceEnqueueSier() {}
+
+// Prepares one sub-test.
+//
+// Checks that the device reports OpenCL 2.x or newer, builds the program
+// with -cl-std=CL2.0, creates the parent kernel and a 2048-byte buffer,
+// selects the image size for `test` and creates a 512 KB default on-device
+// queue.
+//
+// Parameters are forwarded unchanged to OCLTestImp::open(); `test` indexes
+// sizeList and `deviceId` selects the device.
+//
+// Does nothing for CPU devices. Sets failed_ when the device version is
+// below 2.0 and skip_ when the headers were built without CL_VERSION_2_0;
+// run() returns early in both cases.
+void OCLPerfDeviceEnqueueSier::open(unsigned int test, char* units,
+                                    double& conversion, unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // The version string has the form "OpenCL <major>.<minor> ...", so the
+  // major digit sits at index 7. Copy it out and free the buffer right away
+  // so that none of the early returns below can leak it.
+  char majorVersion = '\0';
+  if ((error_ == CL_SUCCESS) && (param_size > 7)) {
+    majorVersion = strVersion[7];
+  }
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (majorVersion < '2') {
+    // Device enqueue needs OpenCL 2.0; run() becomes a no-op.
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Print the build log before reporting the failure below.
+    char programLog[1024];
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  queueSize = 512 * 1024;
+
+  image_size = sizeList[testID_];
+
+#if defined(CL_VERSION_2_0)
+  // Default on-device queue; the kernel obtains it via get_default_queue().
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+// Notification callback with an empty body: every argument is ignored.
+// NOTE(review): nothing in this file references it.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Times repeated 2-D launches of the self-enqueueing parent kernel.
+//
+// After one untimed single-work-item launch, the kernel is enqueued over an
+// image_size x image_size range and finished 100 times under the timer.
+// _perfInfo is set to numOfKernels * repeats / (seconds * 1e6), where
+// numOfKernels is 8^(log3(image_size) - 1), and testDescString to a line
+// describing the configuration. Returns without doing anything on CPU
+// devices or when open() set failed_ or skip_.
+void OCLPerfDeviceEnqueueSier::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  if (failed_) {
+    return;
+  }
+
+  if (skip_) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {1};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  int width = image_size, offsetx = 0, offsety = 0;
+  error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(int), (void*)&width);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(int), (void*)&offsetx);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 3, sizeof(int), (void*)&offsety);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Untimed launch before the measurement starts.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, 0, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  size_t global_work_size[2] = {image_size, image_size};
+
+  // Try to normalize the amount of work per test
+  unsigned int repeats = 100;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                              NULL, global_work_size, 0, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  // numOfKernels = 8^(log3(image_size) - 1). Count the factors of three with
+  // integer division: a pow()/log() expression can land just below the exact
+  // power and lose one when truncated to an integer.
+  double numOfKernels = 1.0;
+  for (unsigned int side = image_size; side > 3; side /= 3) {
+    numOfKernels *= 8.0;
+  }
+  // Multiply in floating point: for the largest image size the product
+  // (8^9 * 100) does not fit in 32 bits.
+  _perfInfo = (float)(numOfKernels * repeats) / (float)(sec * 1000000.);
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "image_size = %5d, queue size %3dKB (Mdisp/s)",
+           image_size, queueSize / 1024);
+  testDescString = buf;
+}
+
+// Releases the on-device queue, if one was created, and then delegates to
+// OCLTestImp::close().
+//
+// The queue handle is cleared after release so that a later close() (for
+// example after an open() that returned early) cannot release it a second
+// time. A release failure is reported without aborting the cleanup.
+//
+// Returns 0 for CPU devices, where open() created nothing; otherwise the
+// value returned by OCLTestImp::close().
+unsigned int OCLPerfDeviceEnqueueSier::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    // Forget the handle even on failure: it must never be released again.
+    deviceQueue_ = NULL;
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h
new file mode 100644
index 0000000000..dc4f5132cd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_SIER_H_
+#define _OCLPERF_DEVICE_ENQUEUE_SIER_H_
+
+#include "OCLTestImp.h"
+
+// Device-enqueue performance test built on OCLTestImp. Each sub-test
+// launches a self-enqueueing kernel over a square range of a different size.
+class OCLPerfDeviceEnqueueSier : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueueSier();
+  virtual ~OCLPerfDeviceEnqueueSier();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device queue; released in close()
+  unsigned int testID_;           // sub-test index passed to open()
+  unsigned int testListSize;      // NOTE(review): appears unused - confirm
+  // unsigned int        threads;
+  cl_uint queueSize;        // device queue size in bytes
+  unsigned int image_size;  // side length of the square 2-D launch
+
+  bool failed_;  // set by open() when the device version is below 2.0
+  bool skip_;    // set by open() when built without CL_VERSION_2_0
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_SIER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp
new file mode 100644
index 0000000000..8ebef5c33c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp
@@ -0,0 +1,391 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDispatchSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define CHAR_BUF_SIZE 512
+
+// One dispatch-speed configuration.
+typedef struct {
+  unsigned int iterations;  // number of kernel launches in the timed loop
+  int flushEvery;  // flush/finish after every N launches; values <= 0
+                   // disable flushing inside the loop
+} testStruct;
+
+// Configurations swept by the test; open() selects one with
+// test % testListSize.
+testStruct testList[] = {
+    {1, -1},         {1, -1},      {10, 1},      {10, -1},      {100, 1},
+    {100, 10},       {100, -1},    {1000, 1},    {1000, 10},    {1000, 100},
+    {1000, -1},      {10000, 1},   {10000, 10},  {10000, 100},  {10000, 1000},
+    {10000, -1},     {100000, 1},  {100000, 10}, {100000, 100}, {100000, 1000},
+    {100000, 10000}, {100000, -1},
+};
+
+// NOTE(review): not referenced in the part of the file visible here -
+// confirm whether it is still needed.
+unsigned int mapTestList[] = {1, 1, 10, 100, 1000, 10000, 100000};
+
+// Replaces shader_ with the source of the _dispatchSpeed kernel, which
+// writes to outBuf only when its global id is negative.
+void OCLPerfDispatchSpeed::genShader(void) {
+  static const char kernelSource[] =
+      "__kernel void _dispatchSpeed(__global float *outBuf)\n"
+      "{\n"
+      "    int i = (int) get_global_id(0);\n"
+      "    if (i < 0)\n"
+      "        outBuf[i] = 0.0f;\n"
+      "}\n";
+  shader_ = kernelSource;
+}
+
+// Registers four sub-tests per table entry: open() derives a warm-up flag
+// and a sleep flag from the sub-test index.
+OCLPerfDispatchSpeed::OCLPerfDispatchSpeed() {
+  testListSize = sizeof(testList) / sizeof(testList[0]);
+  _numSubTests = 4 * testListSize;
+}
+
+// Empty destructor; nothing is released here.
+OCLPerfDispatchSpeed::~OCLPerfDispatchSpeed() {}
+
+// Context notification callback passed to clCreateContext() in open(). The
+// body is empty, so every notification is ignored.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares one sub-test.
+//
+// Decodes `test` into a table entry (test % testListSize), a warm-up flag
+// and a sleep flag, then creates the OpenCL objects used by run(): a context
+// and command queue on the requested device, a 64-float output buffer, and
+// the _dispatchSpeed kernel with that buffer bound as argument 0.
+//
+// `units` is not used here; `conversion` is set to 1.0; `deviceId` selects
+// the device within the platform at _platformIndex.
+//
+// Temporary platform and device arrays are freed before any check that can
+// return early, so the failure paths do not leak them.
+void OCLPerfDispatchSpeed::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test % testListSize;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+  sleep = false;
+  doWarmup = false;
+
+  // Sub-test layout: [plain | warm-up | sleep | sleep + warm-up], each block
+  // covering the whole table.
+  if ((test / testListSize) % 2) {
+    doWarmup = true;
+  }
+  if (test >= (testListSize * 2)) {
+    sleep = true;
+  }
+
+  bufSize_ = 64 * sizeof(cl_float);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  CHECK_RESULT(numPlatforms == 0, "No platforms available!");
+  CHECK_RESULT(_platformIndex >= numPlatforms,
+               "Requested platform not available");
+
+  cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+  error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+  if (error_ == CL_SUCCESS) {
+    platform = platforms[_platformIndex];
+  }
+  delete[] platforms;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  /* Get the number of requested devices */
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the result of this query is deliberately not
+  // checked.
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    // Print the build log, then report the failure below. The log query must
+    // not overwrite error_, which still holds the build result.
+    char log[16384];
+    log[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+  }
+  CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "_dispatchSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+}
+
+void OCLPerfDispatchSpeed::run(void) {  // Measures mean host time per dispatch
+  int global = bufSize_ / sizeof(cl_float);  // one work-item per cl_float
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+  cl_event event = NULL;  // event of the most recent timed dispatch
+  cl_int eventStatus = CL_COMPLETE;
+
+  if (doWarmup) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    // No event requested: it would be overwritten below and never released.
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testList[_openTest].iterations; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, &event);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    if ((testList[_openTest].flushEvery > 0) &&
+        (((i + 1) % testList[_openTest].flushEvery) == 0)) {
+      if (sleep) {  // blocking wait
+        _wrapper->clFinish(cmd_queue_);
+      } else {  // flush, then poll until the status is complete or an error
+        _wrapper->clFlush(cmd_queue_);
+        error_ =
+            _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                     sizeof(cl_int), &eventStatus, NULL);
+        while (eventStatus > 0) {
+          error_ =
+              _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                       sizeof(cl_int), &eventStatus, NULL);
+        }
+      }
+    }
+    if (i != (testList[_openTest].iterations - 1)) {
+      _wrapper->clReleaseEvent(event);  // the last event is kept for the wait
+    }
+  }
+  if (sleep) {
+    _wrapper->clFinish(cmd_queue_);
+  } else {
+    _wrapper->clFlush(cmd_queue_);
+    error_ = _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                      sizeof(cl_int), &eventStatus, NULL);
+    while (eventStatus > 0) {
+      error_ =
+          _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof(cl_int), &eventStatus, NULL);
+    }
+  }
+  _wrapper->clReleaseEvent(event);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // microseconds per launch
+  double perf = (1000000.f * sec / testList[_openTest].iterations);
+  const char *waitType;
+  const char *extraChar;
+  const char *n;  // extra letter so that "spin" + "n" + "ing" reads "spinning"
+  const char *warmup;
+  if (sleep) {
+    waitType = "sleep";
+    extraChar = "";
+    n = "";
+  } else {
+    waitType = "spin";
+    n = "n";
+    extraChar = " ";
+  }
+  if (doWarmup) {
+    warmup = "warmup";
+  } else {
+    warmup = "";
+  }
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  if (testList[_openTest].flushEvery > 0) {
+    SNPRINTF(buf, sizeof(buf),
+             " %7d dispatches %s%sing every %5d %6s (us/disp)",
+             testList[_openTest].iterations, waitType, n,
+             testList[_openTest].flushEvery, warmup);
+  } else {
+    SNPRINTF(buf, sizeof(buf),
+             " %7d dispatches (%s%s)              %6s (us/disp)",
+             testList[_openTest].iterations, waitType, extraChar, warmup);
+  }
+  testDescString = buf;
+}
+
+unsigned int OCLPerfDispatchSpeed::close(void) {  // Releases the CL objects
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // The handles are not reset to NULL here; _crcword is returned unchanged.
+  return _crcword;
+}
+
+OCLPerfMapDispatchSpeed::OCLPerfMapDispatchSpeed() {  // sizes the sub-test set
+  testListSize = sizeof(mapTestList) / sizeof(unsigned int);
+  _numSubTests = testListSize * 2;  // twice the number of mapTestList entries
+}
+
+void OCLPerfMapDispatchSpeed::run(void) {  // Times map + unmap + dispatch
+  cl_mem outBuffer;  // run-local buffer; the member outBuffer_ is not used here
+  outBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                       bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer == 0, "clCreateBuffer(outBuffer) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer);
+
+  int global = bufSize_ / sizeof(cl_float);  // one work-item per cl_float
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  if (doWarmup) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Reset();
+  timer.Start();
+  void *mem;
+  for (unsigned int i = 0; i < mapTestList[_openTest]; i++) {
+    mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer, CL_TRUE,
+                                       CL_MAP_WRITE_INVALIDATE_REGION, 0,
+                                       bufSize_, 0, NULL, NULL, &error_);
+    // Blocking map (CL_TRUE); the mapping is released again without a write.
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // microseconds per launch
+  double perf = (1000000.f * sec / mapTestList[_openTest]);
+  const char *warmup;
+  if (doWarmup) {
+    warmup = "warmup";
+  } else {
+    warmup = "";
+  }
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %7d maps and dispatches %6s (us/disp)",
+           mapTestList[_openTest], warmup);
+  testDescString = buf;
+  // NOTE(review): kernel_ argument 0 still names outBuffer after this release.
+  _wrapper->clReleaseMemObject(outBuffer);
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h
new file mode 100644
index 0000000000..2dfc7bd70b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DispatchSpeed_H_
+#define _OCL_DispatchSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDispatchSpeed : public OCLTestImp {  // kernel dispatch timing test
+ public:
+  OCLPerfDispatchSpeed();
+  virtual ~OCLPerfDispatchSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // timed dispatch loop; result goes to _perfInfo
+  virtual unsigned int close(void);  // releases the CL objects declared below
+
+  std::string shader_;  // kernel source given to clCreateProgramWithSource
+  void genShader(void);  // NOTE(review): presumably fills shader_ - confirm
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;  // bound as kernel argument 0 by open()
+  cl_int error_;  // most recent OpenCL status code
+  bool doWarmup;  // run() issues one untimed dispatch first when true
+
+  unsigned int bufSize_;  // size in bytes passed to clCreateBuffer
+  bool sleep;  // true: wait with clFinish; false: clFlush and poll the event
+  unsigned int testListSize;
+};
+
+class OCLPerfMapDispatchSpeed : public OCLPerfDispatchSpeed {  // map variant
+ public:
+  OCLPerfMapDispatchSpeed();
+  virtual void run(void);  // maps and unmaps a buffer before each dispatch
+};
+#endif  // _OCL_DispatchSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp
new file mode 100644
index 0000000000..6315e1a151
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp
@@ -0,0 +1,442 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDoubleDMA.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <cmath>
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+const size_t blockX = 256;
+const size_t blockY = 256;
+const size_t blockZ = 512;
+
+const size_t chunk = 16;  // Z extent of one transfer
+const size_t size_S = blockX * blockY * blockZ * sizeof(cl_float4);  // total bytes
+const size_t size_s = blockX * blockY * chunk * sizeof(cl_float4);  // bytes per transfer
+static const int WindowWidth = 80;  // columns in one timeline row
+
+const size_t MaxQueues = 3;  // largest number of command queues in a sub-test
+bool profEnable = false;  // set by open(); run() creates profiling queues if true
+
+static const char* strKernel =  // OpenCL C source of the "dummy" kernel
+    "__kernel void dummy(__global float4* out)  \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   float4 value = (float4)(1.0f, 2.0f, 3.0f, 4.0f);  \n"
+    "   uint factorial = 1;                     \n"
+    "   for (uint i = 1; i < (id / 0x400); ++i)\n"  // trip count grows with id
+    "   {                                       \n"
+    "       factorial *= i;                     \n"
+    "   }                                       \n"
+    "   out[id] = value * factorial;            \n"  // one float4 per work-item
+    "}                                          \n";
+
+class ProfileQueue {  // profiling events of one command queue, per operation
+ public:
+  enum Operation { Write = 0, Execute, Read, Total };  // Total = kind count
+
+  static const char* OperationName[Total];
+  static const char StartCommand[Total];  // glyph for a command's first column
+  static const char ExecCommand[Total];  // glyph for its remaining columns
+
+  ProfileQueue() {}
+  ~ProfileQueue() {  // releases every event that was passed to addEvent()
+    for (size_t op = 0; op < Total; ++op) {
+      for (size_t idx = 0; idx < events_[op].size(); ++idx) {
+        clReleaseEvent(events_[op][idx]);
+      }
+    }
+  }
+
+  void addEvent(Operation op, cl_event event) { events_[op].push_back(event); }
+
+  void findMinMax(cl_long* min, cl_long* max) {
+    // Widen [*min, *max]; a stored value of 0 is treated as "not set yet".
+    for (size_t op = 0; (op < ProfileQueue::Total); ++op) {
+      cl_long time;  // START of the first event, then END of the last event
+      if (events_[op].size() == 0) continue;
+      clGetEventProfilingInfo(events_[op][0], CL_PROFILING_COMMAND_START,
+                              sizeof(cl_long), &time, NULL);
+      if (0 == *min) {
+        *min = time;
+      } else {
+        *min = std::min(*min, time);
+      }
+      clGetEventProfilingInfo(events_[op][events_[op].size() - 1],
+                              CL_PROFILING_COMMAND_END, sizeof(cl_long), &time,
+                              NULL);
+      if (0 == *max) {
+        *max = time;
+      } else {
+        *max = std::max(*max, time);
+      }
+    }
+  }
+
+  void display(cl_long start, cl_long finish) {  // prints one row per operation
+    std::string graph;
+    graph.resize(WindowWidth + 1);
+    graph[WindowWidth] = '\x0';
+    cl_long timeFrame = finish - start;
+    cl_long interval = timeFrame / WindowWidth;  // timestamp units per column
+    // NOTE(review): interval is 0 when timeFrame < WindowWidth - confirm callers
+    // Draw the events of each operation kind that has any, clipped to the frame
+    for (size_t op = 0; (op < Total); ++op) {
+      if (events_[op].size() == 0) continue;
+      cl_long timeStart, timeEnd;
+      int begin = 0, end = 0;  // column range of the event drawn last
+      for (size_t idx = 0; idx < events_[op].size(); ++idx) {
+        bool cutStart = false;  // true when the event began before the frame
+        clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_START,
+                                sizeof(cl_long), &timeStart, NULL);
+        clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_END,
+                                sizeof(cl_long), &timeEnd, NULL);
+
+        // Continue if out of the frame scope
+        if (timeStart >= finish) continue;
+        if (timeEnd <= start) continue;
+
+        if (timeStart <= start) {
+          timeStart = start;
+          cutStart = true;
+        }
+
+        if (timeEnd >= finish) {
+          timeEnd = finish;
+        }
+
+        // Readjust time to the frame
+        timeStart -= start;
+        timeEnd -= start;
+        timeStart = static_cast<cl_long>(
+            floor(static_cast<float>(timeStart) / interval + 0.5f));
+        timeEnd = static_cast<cl_long>(
+            floor(static_cast<float>(timeEnd) / interval + 0.5f));
+        begin = static_cast<int>(timeStart);
+        // Idle from end to begin
+        for (int c = end; c < begin; ++c) {
+          graph[c] = '-';
+        }
+        end = static_cast<int>(timeEnd);
+        for (int c = begin; c < end; ++c) {
+          if ((c == begin) && !cutStart) {
+            graph[c] = StartCommand[op];
+          } else {
+            graph[c] = ExecCommand[op];
+          }
+        }
+        if ((begin == end) && (end < WindowWidth)) {
+          graph[begin] = '+';  // event rounds to zero columns
+        }
+      }
+      if (end < WindowWidth) {  // pad the rest of the row as idle
+        for (int c = end; c < WindowWidth; ++c) {
+          graph[c] = '-';
+        }
+      }
+      printf("%s\n", graph.c_str());
+    }
+  }
+
+ private:
+  // Profiling events
+  std::vector<cl_event> events_[Total];
+};
+
+const char* ProfileQueue::OperationName[Total] = {  // indexed by Operation
+    "BufferWrite", "KernelExecution", "BufferRead"};
+const char ProfileQueue::StartCommand[Total] = {'W', 'X', 'R'};  // same order
+const char ProfileQueue::ExecCommand[Total] = {'>', '#', '<'};  // same order
+
+class Profile {  // collects the profiling events of all queues of one run
+ public:
+  Profile(bool profEna, int numQueues)
+      : profileEna_(profEna),
+        numQueues_(numQueues),
+        min_(0),
+        max_(0),
+        execTime_(0) {}
+
+  ~Profile() {}
+
+  void addEvent(int queue, ProfileQueue::Operation op, cl_event event) {
+    if (profileEna_) {  // events are stored only while profiling is enabled
+      profQueue[queue].addEvent(op, event);
+    }
+  }
+
+  cl_long findExecTime() {  // returns max_ - min_ taken over all queues
+    if (execTime_ != 0) return execTime_;  // reuse a nonzero earlier result
+    for (int q = 0; q < numQueues_; ++q) {
+      profQueue[q].findMinMax(&min_, &max_);
+    }
+    execTime_ = max_ - min_;
+    return execTime_;
+  }
+
+  void display(cl_long start, cl_long finish) {  // both are offsets from min_
+    if (!profileEna_) return;
+    printf("\n ----------- Time frame %.3f (us), scale 1:%.0f\n",
+           (float)(finish - start) / 1000,
+           (float)(finish - start) / (1000 * WindowWidth));
+    for (size_t op = 0; (op < ProfileQueue::Total); ++op) {  // legend
+      printf("%s - %c%c; ", ProfileQueue::OperationName[op],
+             ProfileQueue::StartCommand[op], ProfileQueue::ExecCommand[op]);
+    }
+    printf("\n");
+    for (int q = 0; q < numQueues_; ++q) {
+      printf("CommandQueue #%d\n", q);
+      profQueue[q].display(min_ + start, min_ + finish);
+    }
+  }
+
+ private:
+  bool profileEna_;
+  int numQueues_;     //!< Total number of queues
+  cl_long min_;       //!< Min HW timestamp
+  cl_long max_;       //!< Max HW timestamp
+  cl_long execTime_;  //!< Profile time
+  ProfileQueue profQueue[MaxQueues];  // indexed by queue; size is not checked
+};
+
+OCLPerfDoubleDMA::OCLPerfDoubleDMA() {
+  failed_ = false;
+  _numSubTests = 4 * MaxQueues;  // queue counts x kernel on/off x profiling on/off
+}
+
+OCLPerfDoubleDMA::~OCLPerfDoubleDMA() {}  // empty: nothing is released here
+
+void OCLPerfDoubleDMA::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {  // builds kernel + buffers
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;  // run() returns immediately when this is set
+    return;
+  }
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  size_t bufSize = size_s;
+  cl_mem buffer;
+  // Assign rather than only set: profEnable is a file-scope flag and must not
+  // stay true for a later sub-test from the non-profiling half.
+  profEnable = (test_ >= (2 * MaxQueues));
+  test_ %= 2 * MaxQueues;  // fold both halves onto the same index range
+  size_t numBufs = (test_ % MaxQueues) + 1;  // one device buffer per queue
+  for (size_t b = 0; b < numBufs; ++b) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize,
+                                      NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+  // Last entry: the size_S buffer that run() maps as its host-side data.
+  buffer = _wrapper->clCreateBuffer(context_,
+                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                                    size_S, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // deliberate no-op
+
+void OCLPerfDoubleDMA::run(void) {  // moves size_S bytes in size_s chunks
+  if (failed_) {
+    return;
+  }
+  CPerfCounter timer;
+  const int numQueues = (test_ % MaxQueues) + 1;
+  const bool useKernel = ((test_ / MaxQueues) > 0);
+  const int numBufs = numQueues;  // buffers_[numBufs] is the mapped host buffer
+  Profile profile(profEnable, numQueues);
+
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  int q;
+  cl_command_queue_properties qProp =
+      (profEnable) ? CL_QUEUE_PROFILING_ENABLE : 0;
+  for (q = 0; q < numQueues; ++q) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[_deviceId], qProp, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues[q] = cmdQueue;
+  }
+
+  float* Data_s = (float*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      size_S, 0, NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed");
+  size_t gws[1] = {size_s / (4 * sizeof(float))};  // one work-item per float4
+  size_t lws[1] = {256};
+
+  // Warm-up
+  for (q = 0; q < numQueues; ++q) {
+    error_ |=
+        _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                       size_s, (char*)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                       (void*)&buffers_[q]);
+    error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                               gws, lws, 0, NULL, NULL);
+    error_ |=
+        _wrapper->clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                      size_s, (char*)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  size_t s_done = 0;
+  cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};
+
+  /*----------  pass2:  copy Data_s to and from GPU Buffers ----------*/
+  s_done = 0;
+  timer.Reset();
+  timer.Start();
+  int idx = numBufs - 1;
+  // Start from the last so read/write won't go to the same DMA when kernel is
+  // executed
+  q = numQueues - 1;
+  size_t iter = 0;
+  while (1) {
+    if (0 == r[idx]) {  // first use of this buffer: nothing to wait for
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char*)Data_s + s_done, 0, NULL, &w[idx]);
+    } else {  // wait for the previous read of this buffer
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char*)Data_s + s_done, 1, &r[idx], &w[idx]);
+      if (!profEnable) {  // with profiling on, the Profile object keeps events
+        error_ |= _wrapper->clReleaseEvent(r[idx]);
+      }
+    }
+    _wrapper->clFlush(cmdQueues[q]);
+    profile.addEvent(q, ProfileQueue::Write, w[idx]);
+
+    if (useKernel) {
+      // Change the queue
+      ++q %= numQueues;
+      // Implicit flush of DMA engine on kernel start, because memory dependency
+      error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                         (void*)&buffers_[idx]);
+      error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                                 gws, lws, 1, &w[idx], &x[idx]);
+      if (!profEnable) {
+        error_ |= _wrapper->clReleaseEvent(w[idx]);
+      }
+      profile.addEvent(q, ProfileQueue::Execute, x[idx]);
+    }
+    _wrapper->clFlush(cmdQueues[q]);
+
+    // Change the queue
+    ++q %= numQueues;
+    error_ |= _wrapper->clEnqueueReadBuffer(
+        cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+        (char*)Data_s + s_done, 1, (useKernel) ? &x[idx] : &w[idx], &r[idx]);
+    if (!profEnable) {
+      error_ |= _wrapper->clReleaseEvent((useKernel) ? x[idx] : w[idx]);
+    }
+    profile.addEvent(q, ProfileQueue::Read, r[idx]);
+    _wrapper->clFlush(cmdQueues[q]);
+
+    if ((s_done += size_s) >= size_S) {  // all of the host data was processed
+      if (!profEnable) {
+        error_ |= _wrapper->clReleaseEvent(r[idx]);
+      }
+      break;
+    }
+    ++iter;
+    ++idx %= numBufs;
+    ++q %= numQueues;
+  }
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+  timer.Stop();
+  // Accumulate with |= so errors from the timed loop above are not discarded.
+  error_ |= _wrapper->clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs],
+                                              Data_s, 0, NULL, NULL);
+
+  error_ |= _wrapper->clFinish(cmdQueues[0]);
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  cl_long gpuTimeFrame = profile.findExecTime();
+  cl_long oneIter = gpuTimeFrame / iter;
+
+  // Display 4 iterations in the middle
+  cl_long startFrame = oneIter * (iter / 2 - 2);
+  cl_long finishFrame = oneIter * (iter / 2 + 2);
+  profile.display(startFrame, finishFrame);
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  double GBytes = (double)(2 * size_S) / (double)(1000 * 1000 * 1000);
+  _perfInfo = static_cast<float>(GBytes / timer.GetElapsedTime());
+
+  std::stringstream stream;
+  if (useKernel) {
+    stream << "Write/Kernel/Read operation ";
+  } else {
+    stream << "Write/Read operation ";
+  }
+  stream << numQueues << " queues; profiling "
+         << ((profEnable) ? "enabled" : "disabled") << " [GB/s]";
+
+  stream.flags(std::ios::right | std::ios::showbase);
+  testDescString = stream.str();
+}
+
+unsigned int OCLPerfDoubleDMA::close(void) { return OCLTestImp::close(); }  // base only
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h
new file mode 100644
index 0000000000..5eb0d6d060
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_DOUBLE_DMA_H_
+#define _OCL_PERF_DOUBLE_DMA_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDoubleDMA : public OCLTestImp {  // chunked write/kernel/read test
+ public:
+  OCLPerfDoubleDMA();
+  virtual ~OCLPerfDoubleDMA();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // set by open() on a non-GPU device; run() then does nothing
+  unsigned int test_;  // sub-test index; selects queue count and kernel use
+};
+
+#endif  // _OCL_PERF_DOUBLE_DMA_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp
new file mode 100644
index 0000000000..049253d35c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp
@@ -0,0 +1,291 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDoubleDMASeq.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <cmath>
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+#ifdef _WIN32
+const size_t blockX = 128;  // Windows uses a smaller volume: size_S = 64 MiB
+const size_t blockY = 128;
+const size_t blockZ = 256;
+#else
+const size_t blockX = 256;  // elsewhere size_S = 512 MiB
+const size_t blockY = 256;
+const size_t blockZ = 512;
+#endif
+
+const size_t chunk = 16;  // blockX * blockY planes moved per transfer
+const size_t size_S = blockX * blockY * blockZ * sizeof(cl_float4);  // total
+const size_t size_s = blockX * blockY * chunk * sizeof(cl_float4);  // per copy
+static const int WindowWidth = 80;  // not referenced anywhere in this file
+
+const size_t MaxQueues = 3;  // sub-tests use 1..MaxQueues command queues
+
+static const char *strKernel =  // OpenCL C source of the "dummy" kernel
+    "__kernel void dummy(__global float4* out)  \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   float4 value = (float4)(1.0f, 2.0f, 3.0f, 4.0f);  \n"
+    "   uint factorial = 1;                     \n"
+    "   for (uint i = 1; i < (id / 0x400); ++i)\n"
+    "   {                                       \n"
+    "       factorial *= i;                     \n"
+    "   }                                       \n"
+    "   out[id] = value * factorial;            \n"
+    "}                                          \n";  // compiled in open()
+
+OCLPerfDoubleDMASeq::OCLPerfDoubleDMASeq() {
+  _numSubTests = MaxQueues * 2;  // 1..MaxQueues queues, without/with events
+  failed_ = false;               // open() sets it when the device is not a GPU
+}
+
+OCLPerfDoubleDMASeq::~OCLPerfDoubleDMASeq() {}  // nothing is released here
+
+// Sets up one sub-test: builds the "dummy" kernel, creates test_ + 1 device
+// buffers of size_s bytes, then one size_S-byte CL_MEM_ALLOC_HOST_PTR staging
+// buffer stored last in buffers_. Sub-tests >= MaxQueues enable events_.
+void OCLPerfDoubleDMASeq::open(unsigned int test, char *units,
+                               double &conversion, unsigned int deviceId) {
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test % MaxQueues;
+  events_ = (test / MaxQueues) != 0;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zeroed so printf stays defined if the query fails (e.g. oversized log).
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  const size_t numBufs = test_ + 1;  // one device buffer per queue in run()
+  for (size_t b = 0; b < numBufs; ++b) {
+    cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                             size_s, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+  // Host-accessible staging buffer; run() maps it as buffers_[numBufs].
+  cl_mem staging = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size_S, NULL,
+      &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(staging);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; never referenced in this file
+
+void OCLPerfDoubleDMASeq::run(void) {  // timed write -> kernel -> read loop
+  if (failed_) {  // set by open() when the device is not a GPU
+    return;
+  }
+  CPerfCounter timer;
+  const int numQueues = (test_ % MaxQueues) + 1;
+  const int numBufs = numQueues;  // open() created one device buffer per queue
+
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  int q;
+  cl_command_queue_properties qProp = 0;  // no queue properties requested
+  for (q = 0; q < numQueues; ++q) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[_deviceId], qProp, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues[q] = cmdQueue;
+  }
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+  // Map the staging buffer open() stored last. NOTE(review): no NULL check.
+  float *Data_s = (float *)_wrapper->clEnqueueMapBuffer(
+      cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      size_S, 0, NULL, NULL, &error_);
+
+  size_t gws[1] = {size_s / (4 * sizeof(float))};  // one work-item per float4
+  size_t lws[1] = {256};
+
+  // Warm-up: one untimed write/kernel/read round trip on every queue
+  for (q = 0; q < numQueues; ++q) {
+    error_ |=
+        _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                       size_s, (char *)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                       (void *)&buffers_[q]);
+    error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                               gws, lws, 0, NULL, NULL);
+    error_ |=
+        _wrapper->clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                      size_s, (char *)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  size_t s_done = 0;  // bytes of the staging buffer processed so far
+  cl_event x[MaxQueues] = {0};  // x[q]: latest kernel event of queue q
+
+  /*----------  pass2:  copy Data_s to and from GPU Buffers ----------*/
+  s_done = 0;
+  timer.Reset();
+  timer.Start();
+  int idx = numBufs - 1;
+  // Start from the last so read/write won't go to the same DMA when kernel is
+  // executed
+  q = numQueues - 1;
+  size_t iter = 0;  // incremented per iteration but never read
+  if (events_) {  // chain each kernel to the previous queue's kernel
+    while (1) {
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char *)Data_s + s_done, 0, NULL, NULL);
+
+      // Implicit flush of DMA engine on kernel start, because memory dependency
+      error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                         (void *)&buffers_[idx]);
+      int prevQ;  // queue used by the previous iteration (wraps around)
+      if (q == 0) {
+        prevQ = numQueues - 1;
+      } else {
+        prevQ = q - 1;
+      }
+      if ((x[prevQ] != NULL) && (numQueues != 1)) {  // wait on that kernel
+        error_ |= _wrapper->clEnqueueNDRangeKernel(
+            cmdQueues[q], kernel_, 1, NULL, gws, lws, 1, &x[prevQ], &x[q]);
+        error_ |= _wrapper->clReleaseEvent(x[prevQ]);
+        x[prevQ] = NULL;
+      } else {
+        error_ |= _wrapper->clEnqueueNDRangeKernel(
+            cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, &x[q]);
+        if (numQueues == 1) {  // nothing to chain to: drop the event at once
+          error_ |= _wrapper->clReleaseEvent(x[q]);
+          x[q] = NULL;
+        }
+      }
+      error_ |= _wrapper->clFlush(cmdQueues[q]);
+
+      // Read back on the same queue; q only advances at the end of the loop
+      error_ |= _wrapper->clEnqueueReadBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char *)Data_s + s_done, 0, NULL, NULL);
+
+      if ((s_done += size_s) >= size_S) {  // whole staging buffer processed
+        break;
+      }
+
+      error_ |= _wrapper->clFlush(cmdQueues[q]);
+      ++iter;
+      ++idx %= numBufs;  // round-robin to the next buffer ...
+      ++q %= numQueues;  // ... and the next queue
+    }
+    for (q = 0; q < numQueues; ++q) {  // release events still held
+      if (x[q] != NULL) {
+        error_ |= _wrapper->clReleaseEvent(x[q]);
+      }
+    }
+  } else {  // same pipeline without any event dependencies
+    while (1) {
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char *)Data_s + s_done, 0, NULL, NULL);
+
+      // Implicit flush of DMA engine on kernel start, because memory dependency
+      error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                         (void *)&buffers_[idx]);
+      error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                                 gws, lws, 0, NULL, NULL);
+
+      // Read back on the same queue; q only advances at the end of the loop
+      error_ |= _wrapper->clEnqueueReadBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char *)Data_s + s_done, 0, NULL, NULL);
+
+      if ((s_done += size_s) >= size_S) {  // whole staging buffer processed
+        break;
+      }
+
+      error_ |= _wrapper->clFlush(cmdQueues[q]);
+      ++iter;
+      ++idx %= numBufs;  // round-robin to the next buffer ...
+      ++q %= numQueues;  // ... and the next queue
+    }
+  }
+
+  for (q = 0; q < numQueues; ++q) {  // drain every queue before stopping
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+  timer.Stop();
+
+  error_ |= _wrapper->clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs],
+                                              Data_s, 0, NULL, NULL);
+
+  error_ |= _wrapper->clFinish(cmdQueues[0]);
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+  // 2x: every chunk is written to the device and read back once.
+  double GBytes = (double)(2 * size_S) / (double)(1000 * 1000 * 1000);
+  _perfInfo = static_cast<float>(GBytes / timer.GetElapsedTime());
+
+  std::stringstream stream;
+  stream << "Write/Kernel/Read operation ";
+
+  stream << numQueues << " queues ";
+  if (events_) {
+    stream << " (use events) ";
+  }
+  stream << " [GB/s]";
+
+  stream.flags(std::ios::right | std::ios::showbase);  // nothing is written after this
+  testDescString = stream.str();
+}
+// Delegates all cleanup to OCLTestImp::close() and returns its result.
+unsigned int OCLPerfDoubleDMASeq::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h
new file mode 100644
index 0000000000..7569233798
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_DOUBLE_DMA_SEQ_H_
+#define _OCL_PERF_DOUBLE_DMA_SEQ_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfDoubleDMASeq : public OCLTestImp {  // test case of the perf module
+ public:
+  OCLPerfDoubleDMASeq();
+  virtual ~OCLPerfDoubleDMASeq();
+
+ public:  // sub-test hooks, defined in OCLPerfDoubleDMASeq.cpp
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;        // true when open() found a non-GPU device; run() returns
+  unsigned int test_;  // sub-test index, reduced modulo MaxQueues by open()
+  bool events_;        // true for sub-tests >= MaxQueues: kernels chained by events
+};
+
+#endif  // _OCL_PERF_DOUBLE_DMA_SEQ_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp
new file mode 100644
index 0000000000..e090a768ad
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp
@@ -0,0 +1,114 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfFillBuffer.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Bounded formatter: sprintf_s under WIN_OS (quiets warnings), else snprintf
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+static size_t typeSizeList[] = {  // fill-pattern sizes in bytes
+    1,  // sizeof(cl_uchar)
+    2,   4, 8, 16, 32, 64,
+    128,  // sizeof(cl_ulong16)
+};
+
+static unsigned int eleNumList[] = {  // open() sets bufSize_ = entry * 4 bytes
+    0x0020000, 0x0080000, 0x0200000, 0x0800000, 0x2000000,
+};
+
+OCLPerfFillBuffer::OCLPerfFillBuffer() {
+  buffer_ = 0;  // close() tests buffer_, so it must never be indeterminate
+  num_typeSize_ = sizeof(typeSizeList) / sizeof(typeSizeList[0]);
+  num_elements_ = sizeof(eleNumList) / sizeof(eleNumList[0]);
+  _numSubTests = num_elements_ * num_typeSize_;  // every size x pattern pair
+  failed_ = skip_ = false;
+}
+
+OCLPerfFillBuffer::~OCLPerfFillBuffer() {}  // buffer_ is released in close()
+
+// Picks the pattern size and element count for sub-test `test` and allocates
+// the bufSize_-byte device buffer that run() fills.
+void OCLPerfFillBuffer::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  buffer_ = 0;  // reset first: close() tests buffer_ even if a check below fails
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  testTypeSize_ = typeSizeList[(test / num_elements_) % num_typeSize_];
+  testNumEle_ = eleNumList[test % num_elements_];
+  bufSize_ = testNumEle_ * 4;
+
+  buffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize_, 0,
+                                     &error_);
+  CHECK_RESULT(buffer_ == 0, "clCreateBuffer(buffer_) failed");
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; never referenced in this file
+
+// Times 100 back-to-back clEnqueueFillBuffer calls on buffer_; reports GB/s.
+void OCLPerfFillBuffer::run(void) {
+  CPerfCounter timer;
+  size_t iter = 100;
+  void *data = calloc(1, testTypeSize_);  // zero pattern, freed after the loop
+  CHECK_RESULT(data == NULL, "calloc() failed");
+  timer.Reset();
+  timer.Start();
+  for (size_t i = 0; i < iter; ++i) {
+    error_ = clEnqueueFillBuffer(cmdQueues_[_deviceId], buffer_, data,
+                                 testTypeSize_, 0, bufSize_, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) break;  // reported below, once data is freed
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+  free(data);  // allowed: the pattern may be freed once the enqueue returns
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillBuffer() failed");
+
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "FillBuffer (GB/s) for %6d KB, typeSize:%3d",
+           (int)bufSize_ / 1024, (int)testTypeSize_);
+  testDescString = buf;
+  double sec = timer.GetElapsedTime();
+  // Multiply in double: bufSize_ * iter does not fit a 32-bit size_t.
+  _perfInfo = static_cast<float>(((double)bufSize_ * iter * 1e-09) / sec);
+}
+
+unsigned int OCLPerfFillBuffer::close(void) {
+  if (buffer_) {  // skip the release when no buffer handle is held
+    error_ = _wrapper->clReleaseMemObject(buffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(buffer) failed");
+  }
+  return OCLTestImp::close();  // base-class cleanup supplies the return value
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h
new file mode 100644
index 0000000000..afd6d0caea
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_FILL_BUFFER_H_
+#define _OCL_PERF_FILL_BUFFER_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfFillBuffer : public OCLTestImp {  // test case of the perf module
+ public:
+  OCLPerfFillBuffer();
+  virtual ~OCLPerfFillBuffer();
+
+ public:  // sub-test hooks, defined in OCLPerfFillBuffer.cpp
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_mem buffer_;              // device buffer created in open()
+  unsigned int bufSize_;       // buffer size in bytes (testNumEle_ * 4)
+  unsigned int num_typeSize_;  // number of entries in typeSizeList
+  unsigned int num_elements_;  // number of entries in eleNumList
+  size_t testTypeSize_;        // fill-pattern size in bytes for this sub-test
+  unsigned int testNumEle_;    // eleNumList entry chosen for this sub-test
+  bool failed_;                // only ever assigned false (in the constructor)
+  bool skip_;                  // only ever assigned false (in the constructor)
+};
+
+#endif  // _OCL_PERF_FILL_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp
new file mode 100644
index 0000000000..7de92cc1a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfFillImage.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Bounded formatter: sprintf_s under WIN_OS (quiets warnings), else snprintf
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+static unsigned int sizeList[] = {  // square image edge lengths in pixels
+    256, 512, 1024, 2048, 4096, 8192,
+};
+
+OCLPerfFillImage::OCLPerfFillImage() {
+  buffer_ = 0;  // close() tests buffer_, so it must never be indeterminate
+  num_sizes_ = sizeof(sizeList) / sizeof(sizeList[0]);
+  _numSubTests = num_sizes_;  // one sub-test per image size
+  failed_ = skip_ = false;
+}
+
+OCLPerfFillImage::~OCLPerfFillImage() {}  // buffer_ is released in close()
+
+// Creates the bufSize_ x bufSize_ RGBA8 image that run() fills; sub-test `test`
+// selects the edge length from sizeList.
+void OCLPerfFillImage::open(unsigned int test, char *units, double &conversion,
+                            unsigned int deviceId) {
+  buffer_ = 0;  // reset first: close() tests buffer_ even if a check below fails
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  bufSize_ = sizeList[test % num_sizes_];
+  cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8};
+  buffer_ = _wrapper->clCreateImage2D(context_, CL_MEM_WRITE_ONLY, &format,
+                                      bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(buffer_ == 0, "clCreateImage2D(imageBuffer_) failed");
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; never referenced in this file
+
+// Times 100 back-to-back clEnqueueFillImage calls covering the whole image and
+// reports GB/s, counting 4 bytes per RGBA8 pixel.
+void OCLPerfFillImage::run(void) {
+  CPerfCounter timer;
+  size_t iter = 100;
+  cl_uint4 fillColor = {1, 1, 1, 1};
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+
+  timer.Reset();
+  timer.Start();
+  for (size_t i = 0; i < iter; ++i) {
+    error_ = clEnqueueFillImage(cmdQueues_[_deviceId], buffer_,
+                                (const void *)&fillColor, origin, region, 0,
+                                NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "FillImage (GB/s) for %4dx%4d ", (int)bufSize_,
+           (int)bufSize_);
+  testDescString = buf;
+  // Widen to double before multiplying: the integer product exceeds 32 bits
+  // for the larger images whenever size_t is 32 bits wide.
+  const double bytes = (double)bufSize_ * bufSize_ * 4 * iter;
+  _perfInfo = static_cast<float>((bytes * 1e-09) / timer.GetElapsedTime());
+}
+
+unsigned int OCLPerfFillImage::close(void) {
+  if (buffer_) {  // skip the release when no image handle is held
+    error_ = _wrapper->clReleaseMemObject(buffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(buffer) failed");
+  }
+  return OCLTestImp::close();  // base-class cleanup supplies the return value
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h
new file mode 100644
index 0000000000..5313e7941c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_FILL_IMAGE_H_
+#define _OCL_PERF_FILL_IMAGE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfFillImage : public OCLTestImp {  // test case of the perf module
+ public:
+  OCLPerfFillImage();
+  virtual ~OCLPerfFillImage();
+
+ public:  // sub-test hooks, defined in OCLPerfFillImage.cpp
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_mem buffer_;           // 2D image created in open() (despite the name)
+  unsigned int bufSize_;    // image width and height in pixels
+  unsigned int num_sizes_;  // number of entries in sizeList
+  bool failed_;             // only ever assigned false (in the constructor)
+  bool skip_;               // only ever assigned false (in the constructor)
+};
+
+#endif  // _OCL_PERF_FILL_IMAGE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp
new file mode 100644
index 0000000000..c38f73e01d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp
@@ -0,0 +1,165 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfFlush.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+
+static const cl_uint Iterations = 0x10000;  // kernel launches per timed run
+static const cl_uint IterationDivider = 2;  // launches per batch (inner loop)
+static const size_t MaxBuffers = IterationDivider;  // one buffer per launch in a batch
+static size_t BufSize = 0x1000;  // in cl_uint elements; open() may raise it
+
+const static char* strKernel =  // OpenCL C source of the "factorial" kernel
+    "__kernel void factorial(__global uint* out)                        \n"
+    "{                                                                  \n"
+    "   uint id = get_global_id(0);                                     \n"
+    "   uint factorial = 1;                                             \n"
+    "   for (uint i = 1; i < (id / 0x10000); ++i)                       \n"
+    "   {                                                               \n"
+    "       factorial *= i;                                             \n"
+    "   }                                                               \n"
+    "    out[id] = factorial;                                            \n"
+    "}                                                                  \n";  // compiled in open()
+
+static const unsigned int NumTests = 3;  // file-local; one per descriptions[] entry in run()
+
+OCLPerfFlush::OCLPerfFlush() {
+  _numSubTests = NumTests;  // no per-batch sync, clFlush per batch, clFinish per batch
+  failed_ = false;          // open() sets it when the device is not a GPU
+}
+
+OCLPerfFlush::~OCLPerfFlush() {}  // nothing is released here
+
+// Sub-test setup: grows BufSize to at least 32 * max work-group size * compute
+// units, builds the "factorial" kernel and creates MaxBuffers buffers of
+// BufSize cl_uint elements. Non-GPU devices set failed_ and skip the rest.
+void OCLPerfFlush::open(unsigned int test, char* units, double& conversion,
+                        unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  size_t maxWorkGroupSize = 1;  // fallback; query errors are not checked
+  cl_uint computePower = 1;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE,
+      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
+  computePower *= static_cast<cl_uint>(maxWorkGroupSize);
+  cl_uint maxComputeUnits = 1;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits),
+      &maxComputeUnits, NULL);
+  computePower *= 32 * maxComputeUnits;
+  if (BufSize < computePower) BufSize = computePower;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zeroed so printf stays defined if the query fails (e.g. oversized log).
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  for (size_t i = 0; i < MaxBuffers; ++i) {
+    cl_mem buffer = _wrapper->clCreateBuffer(
+        context_, CL_MEM_READ_WRITE, BufSize * sizeof(cl_uint), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+}
+
+void OCLPerfFlush::run(void) {  // times Iterations kernel launches
+  if (failed_) {  // set by open() when the device is not a GPU
+    return;
+  }
+  for (size_t y = 0; y < IterationDivider; ++y) {  // untimed warm-up batch
+    cl_mem buffer = buffers()[y];  // presumably the buffers_ filled by open() - confirm
+
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    size_t gws[1] = {BufSize};  // one work-item per buffer element
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  CPerfCounter timer;
+  const char* descriptions[] = {  // indexed by test_ (0..2)
+      "Single batch: ", "clFlush():    ", "clFinish():   "};
+
+  timer.Reset();
+  timer.Start();
+  cl_uint x;
+  for (x = 0; x < Iterations / IterationDivider; x++) {
+    for (size_t y = 0; y < IterationDivider; ++y) {  // one batch of launches
+      cl_mem buffer = buffers()[y];
+
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+      size_t gws[1] = {BufSize};
+      error_ = _wrapper->clEnqueueNDRangeKernel(
+          cmdQueues_[_deviceId], kernel_, 1, NULL, gws, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    }
+    if (test_ == 1) {  // sub-test 1: flush after every batch
+      _wrapper->clFlush(cmdQueues_[_deviceId]);
+    } else if (test_ == 2) {  // sub-test 2: full sync after every batch
+      _wrapper->clFinish(cmdQueues_[_deviceId]);
+    }  // sub-test 0: no per-batch call, single sync below
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  std::stringstream stream;
+  stream << "Loop[" << std::hex << Iterations << "], " << descriptions[test_];
+  stream << "(sec)";
+  testDescString = stream.str();
+  _perfInfo = static_cast<float>(timer.GetElapsedTime());  // elapsed time, labelled "(sec)"
+}
+// Delegates all cleanup to OCLTestImp::close() and returns its result.
+unsigned int OCLPerfFlush::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h
new file mode 100644
index 0000000000..06c71c7354
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_FLUSH_H_
+#define _OCL_PERF_FLUSH_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfFlush : public OCLTestImp {
+ public:
+  OCLPerfFlush();
+  virtual ~OCLPerfFlush();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // times batched kernel enqueues into _perfInfo
+  virtual unsigned int close(void);  // forwards to OCLTestImp::close()
+
+ private:
+  bool failed_;  // NOTE(review): no use visible in this chunk of run()
+  unsigned int test_;  // run() mode: 0 single batch, 1 clFlush, 2 clFinish
+};
+
+#endif  // _OCL_PERF_FLUSH_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp
new file mode 100644
index 0000000000..2cc45d7e61
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp
@@ -0,0 +1,309 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfGenericBandwidth.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Buffer sizes in bytes (256 KB, 1 MB, 4 MB, 16 MB); open() uses test % NUM_SIZES
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+
+void OCLPerfGenericBandwidth::genShader(unsigned int idx) {
+  shader_.clear();  // idx 0: per-lane strided loads; else: all lanes share loads
+  if (idx == 0) {
+    shader_ +=
+        "__kernel __attribute__((reqd_work_group_size(64,1,1))) void "
+        "_genericReadSpeed(global float *outBuf, global float *inBuf, local "
+        "float *inLocal, float c, char useLocal)\n"
+        "{\n"
+        "    int gid = (int) get_global_id(0);\n"
+        "    int lid = (int) get_local_id(0);\n"
+        "    float val0 = 0.0f;\n"
+        "    float val1 = 0.0f;\n"
+        "    float *localLocal;\n"
+        "    int hacklid = gid % 64;\n"
+        "    if (useLocal)\n"
+        "        localLocal = inLocal;\n"
+        "    else\n"
+        "        localLocal = inBuf;\n"
+        "    for (int i = 0; i < (768/64); i++) {\n"
+        "        localLocal[hacklid + i*64] = lid;\n"
+        "    }\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "#pragma nounroll\n"
+        "    for (uint i = 0; i < 32;i++)\n"
+        "    {\n"
+        "        val0 += localLocal[lid+0];\n"
+        "        val1 += localLocal[lid+64];\n"
+        "        val0 += localLocal[lid+128];\n"
+        "        val1 += localLocal[lid+192];\n"
+        "        val0 += localLocal[lid+256];\n"
+        "        val1 += localLocal[lid+320];\n"
+        "        val0 += localLocal[lid+384];\n"
+        "        val1 += localLocal[lid+448];\n"
+        "        lid += 1;\n"
+        "    }\n"
+        "    val0 += val1;\n"
+        "    val1 = min(val0,1.0f);\n"
+        "    if ((lid + val1) < 0){\n"
+        "        outBuf[gid] = val0;\n"
+        "    }\n"
+        "}\n";
+    dataSizeBytes_ = 768 * 4;  // the init loop fills 768 floats per work-group
+  } else {
+    shader_ +=
+        "__kernel __attribute__((reqd_work_group_size(64,1,1))) void "
+        "_genericReadSpeed(global float *outBuf, global float *inBuf, local "
+        "float *inLocal, float c, char useLocal)\n"
+        "{\n"
+        "    uint gid = (uint) get_global_id(0);\n"
+        "    int lid = (int) get_local_id(0);\n"
+        "    float val0 = 0.0f;\n"
+        "    float val1 = 0.0f;\n"
+        "    float *localLocal;\n"
+        "    uint hacklid = gid % 64;\n"
+        "    if (useLocal)\n"
+        "        localLocal = inLocal;\n"
+        "    else\n"
+        "        localLocal = inBuf;\n"
+        "    for (int i = 0; i < (256/64); i++) {\n"
+        "        localLocal[hacklid + i*64] = lid;\n"
+        "    }\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "    #pragma nounroll\n"
+        "    for (uint i = 0; i < 32;i++)\n"
+        "    {\n"
+        "        val0 += localLocal[8*i+0];\n"
+        "        val1 += localLocal[8*i+1];\n"
+        "        val0 += localLocal[8*i+2];\n"
+        "        val1 += localLocal[8*i+3];\n"
+        "        val0 += localLocal[8*i+4];\n"
+        "        val1 += localLocal[8*i+5];\n"
+        "        val0 += localLocal[8*i+6];\n"
+        "        val1 += localLocal[8*i+7];\n"
+        "    }\n"
+        "    val0 += val1;\n"
+        "    val1 = min(val0,1.0f);\n"
+        "    if ((lid + val1) < 0){\n"
+        "        outBuf[gid] = val0;\n"
+        "    }\n"
+        "}\n";
+    dataSizeBytes_ = 256 * 4;  // the init loop fills 256 floats per work-group
+  }
+}
+
+OCLPerfGenericBandwidth::OCLPerfGenericBandwidth() {
+  _numSubTests = NUM_SIZES * 4;  // sizes x {LDS, global} x {reads, broadcast}
+}
+
+OCLPerfGenericBandwidth::~OCLPerfGenericBandwidth() {}  // buffers go in close()
+// Fills every float of `buffer` with `val` through a blocking map/unmap.
+void OCLPerfGenericBandwidth::setData(cl_mem buffer, float val) {
+  cl_command_queue q = cmdQueues_[_deviceId];
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(
+      q, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // never write NULL
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(q, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(q);
+}
+// Verifies all floats in `buffer` equal numReads_; logs the first mismatch.
+void OCLPerfGenericBandwidth::checkData(cl_mem buffer) {
+  cl_command_queue q = cmdQueues_[_deviceId];
+  const unsigned int count = bufSize_ >> 2;  // number of floats in the buffer
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(
+      q, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < count; i++) {
+    if (data[i] == (float)numReads_) continue;
+    printf("Data validation failed at index %u!\n", i);
+    printf("Expected %u, got:", numReads_);
+    for (unsigned int j = i; j < count && j - i < 4; j++)  // stay in bounds
+      printf(" %u", (unsigned int)data[j]);
+    printf("\n");
+    CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+    break;
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(q, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(q);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; not referenced in this file
+
+// Prepares one sub-test: `test` picks buffer size, LDS vs. global and kernel.
+void OCLPerfGenericBandwidth::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  // Reset our handles first so close() is safe if the base open fails below.
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  failed = false;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  _crcword = 0;
+  conversion = 1.0f;
+
+  kernel_ = 0;
+  useLDS_ = ((test / NUM_SIZES) % 2) == 0 ? 1 : 0;
+
+  size_t param_size = 0;
+  char *strVersion = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  strVersion = new char[param_size];
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION,
+                                param_size, strVersion, 0);
+  // "OpenCL C <major>.<minor> ..." keeps the major version digit at index 9.
+  bool tooOld = error_ != CL_SUCCESS || param_size < 10 || strVersion[9] < '2';
+  delete[] strVersion;  // new[] needs delete[]; freed before any return below
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  failed = tooOld;  // kernels are built with -cl-std=CL2.0; run() then skips
+  if (failed) return;
+
+  numReads_ = 32;
+  width_ = Sizes[test % NUM_SIZES];
+  shaderIdx_ = test / (NUM_SIZES * 2);
+
+  bufSize_ = width_;
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader(shaderIdx_);
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // stays empty if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                    NULL);
+    printf("Build error -> %s\n", log);
+    // The condition must be true for CHECK_RESULT to record the failure.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_genericReadSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  float foo = 0;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, 1024 * sizeof(cl_float),
+                                    (void *)NULL);
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_float), (void *)&foo);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_char), (void *)&useLDS_);
+
+  setData(outBuffer_, 1.2345678f);
+}
+
+// Times NUM_ITER back-to-back kernel launches and reports bandwidth in GB/s.
+void OCLPerfGenericBandwidth::run(void) {
+  if (failed) return;  // open() flagged the device as unsupported
+  int global = bufSize_ / sizeof(cl_float);
+  int local = 64;  // matches reqd_work_group_size(64,1,1) in both kernels
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  // Launches are only queued here; the clFinish() below waits for all of them.
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmdQueues_[_deviceId], kernel_, 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  char buf[256];
+  const char *buf2 = useLDS_ ? "LDS" : "global";
+  const char *buf3 = (shaderIdx_ == 0) ? "reads" : "broadcast";
+
+  // Both kernels issue 8 loads in each of their 32 loop iterations.
+  // NOTE(review): this scales the member in place, so calling run() again
+  // without a fresh open() would multiply the reported read count by 8 again.
+  numReads_ *= 8;
+
+  // LDS bandwidth in GB/s
+  // We have one extra write per LDS location to initialize LDS
+  // Bytes per work-item: numReads_ float loads plus a 1/64 share of the
+  // dataSizeBytes_ written by the kernel's initialisation loop.
+  double perf =
+      ((double)global * (numReads_ * sizeof(cl_float) + dataSizeBytes_ / 64) *
+       NUM_ITER * (double)(1e-09)) /
+      sec;
+
+  _perfInfo = (float)perf;
+  // numReads_ is unsigned, so it is formatted with %u.
+  SNPRINTF(buf, sizeof(buf), " %6s %9s %8d threads, %3u reads (GB/s) ", buf2,
+           buf3, global, numReads_);
+  testDescString = buf;
+  // NOTE(review): checkData(outBuffer_) is disabled; results are not verified.
+}
+
+unsigned int OCLPerfGenericBandwidth::close(void) {  // frees open()'s buffers
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  // Hand over to the base-class close() and return its result.
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h
new file mode 100644
index 0000000000..6898fc0f88
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GenericBandwidth_H_
+#define _OCL_GenericBandwidth_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfGenericBandwidth : public OCLTestImp {
+ public:
+  OCLPerfGenericBandwidth();
+  virtual ~OCLPerfGenericBandwidth();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;  // kernel source text produced by genShader()
+  void genShader(unsigned int idx);
+  void setData(cl_mem buffer, float data);
+  void checkData(cl_mem buffer);
+
+  static const unsigned int NUM_ITER = 100;  // kernel launches timed per run()
+
+  cl_mem inBuffer_;   // bound as kernel argument 1
+  cl_mem outBuffer_;  // bound as kernel argument 0
+
+  unsigned int width_;          // sub-test size in bytes, taken from Sizes[]
+  unsigned int bufSize_;        // byte size of both buffers (set to width_)
+  unsigned int vecSizeIdx_;     // NOTE(review): not referenced in the .cpp
+  unsigned int numReads_;       // 32 after open(); run() multiplies it by 8
+  unsigned int shaderIdx_;      // 0: "reads" kernel, otherwise "broadcast"
+  unsigned int dataSizeBytes_;  // set by genShader(): 768 * 4 or 256 * 4
+  cl_char useLDS_;              // kernel argument 4; nonzero selects inLocal
+  bool failed;  // set by open() for pre-2.0 OpenCL C; run() then returns
+};
+
+#endif  // _OCL_GenericBandwidth_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.cpp
new file mode 100644
index 0000000000..0c92f2d638
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.cpp
@@ -0,0 +1,429 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfGenoilSiaMiner.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_INTENSITY 15
+// log2 of the global work size per sub-test. NOTE(review): 27 is absent - intended?
+static const unsigned int intensities[NUM_INTENSITY] = {
+    DEFAULT_INTENSITY, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31};
+
+static const char *siaKernel =  // OpenCL C source of the nonceGrind kernel
+    "   inline static uint2 ror64(const uint2 x, const uint y)                 "
+    "                                   \n"
+    "   {                                                                      "
+    "                                   \n"
+    "       return "
+    "(uint2)(((x).x>>y)^((x).y<<(32-y)),((x).y>>y)^((x).x<<(32-y)));           "
+    "                     \n"
+    "   }                                                                      "
+    "                                   \n"
+    "   inline static uint2 ror64_2(const uint2 x, const uint y)               "
+    "                                   \n"
+    "   {                                                                      "
+    "                                   \n"
+    "       return "
+    "(uint2)(((x).y>>(y-32))^((x).x<<(64-y)),((x).x>>(y-32))^((x).y<<(64-y))); "
+    "                     \n"
+    "   }                                                                      "
+    "                                   \n"
+    "   __constant static const uchar blake2b_sigma[12][16] = {                "
+    "                                   \n"
+    "       { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } "
+    ",                                  \n"
+    "       { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } "
+    ",                                  \n"
+    "       { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } "
+    ",                                  \n"
+    "       { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } "
+    ",                                  \n"
+    "       { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } "
+    ",                                  \n"
+    "       { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } "
+    ",                                  \n"
+    "       { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } "
+    ",                                  \n"
+    "       { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } "
+    ",                                  \n"
+    "       { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } "
+    ",                                  \n"
+    "       { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } "
+    ",                                  \n"
+    "       { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } "
+    ",                                  \n"
+    "       { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } "
+    "};                                 \n"
+    "   // Target is passed in via headerIn[32 - 29]                           "
+    "                                   \n"
+    "   __kernel void nonceGrind(__global ulong *headerIn, __global ulong "
+    "*nonceOut) {                            \n"
+    "       ulong target = headerIn[4];                                        "
+    "                                   \n"
+    "       ulong m[16] = {    headerIn[0], headerIn[1],                       "
+    "                                   \n"
+    "                       headerIn[2], headerIn[3],                          "
+    "                                   \n"
+    "                       (ulong)get_global_id(0), headerIn[5],              "
+    "                                   \n"
+    "                       headerIn[6], headerIn[7],                          "
+    "                                   \n"
+    "                       headerIn[8], headerIn[9], 0, 0, 0, 0, 0, 0 };      "
+    "                                   \n"
+    "       ulong v[16] = { 0x6a09e667f2bdc928, 0xbb67ae8584caa73b, "
+    "0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,       \n"
+    "                       0x510e527fade682d1, 0x9b05688c2b3e6c1f, "
+    "0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,       \n"
+    "                       0x6a09e667f3bcc908, 0xbb67ae8584caa73b, "
+    "0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,       \n"
+    "                       0x510e527fade68281, 0x9b05688c2b3e6c1f, "
+    "0xe07c265404be4294, 0x5be0cd19137e2179 };     \n"
+    "   #define G(r,i,a,b,c,d) \\\n"
+    "       a = a + b + m[ blake2b_sigma[r][2*i] ]; \\\n"
+    "       ((uint2*)&d)[0] = ((uint2*)&d)[0].yx ^ ((uint2*)&a)[0].yx; \\\n"
+    "       c = c + d; \\\n"
+    "       ((uint2*)&b)[0] = ror64( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 24U); "
+    "\\\n"
+    "       a = a + b + m[ blake2b_sigma[r][2*i+1] ]; \\\n"
+    "       ((uint2*)&d)[0] = ror64( ((uint2*)&d)[0] ^ ((uint2*)&a)[0], 16U); "
+    "\\\n"
+    "       c = c + d; \\\n"
+    "       ((uint2*)&b)[0] = ror64_2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], "
+    "63U);\n"
+    "   #define ROUND(r)                    \\\n"
+    "       G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \\\n"
+    "       G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \\\n"
+    "       G(r,2,v[ 2],v[ 6],v[10],v[14]); \\\n"
+    "       G(r,3,v[ 3],v[ 7],v[11],v[15]); \\\n"
+    "       G(r,4,v[ 0],v[ 5],v[10],v[15]); \\\n"
+    "       G(r,5,v[ 1],v[ 6],v[11],v[12]); \\\n"
+    "       G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \\\n"
+    "       G(r,7,v[ 3],v[ 4],v[ 9],v[14]);                                    "
+    "                                   \n"
+    "       ROUND( 0 );                                                        "
+    "                                   \n"
+    "       ROUND( 1 );                                                        "
+    "                                   \n"
+    "       ROUND( 2 );                                                        "
+    "                                   \n"
+    "       ROUND( 3 );                                                        "
+    "                                   \n"
+    "       ROUND( 4 );                                                        "
+    "                                   \n"
+    "       ROUND( 5 );                                                        "
+    "                                   \n"
+    "       ROUND( 6 );                                                        "
+    "                                   \n"
+    "       ROUND( 7 );                                                        "
+    "                                   \n"
+    "       ROUND( 8 );                                                        "
+    "                                   \n"
+    "       ROUND( 9 );                                                        "
+    "                                   \n"
+    "       ROUND( 10 );                                                       "
+    "                                   \n"
+    "       ROUND( 11 );                                                       "
+    "                                   \n"
+    "   #undef G                                                               "
+    "                                   \n"
+    "   #undef ROUND                                                           "
+    "                                   \n"
+    "       if (as_ulong(as_uchar8(0x6a09e667f2bdc928 ^ v[0] ^ "
+    "v[8]).s76543210) < target) {                       \n"
+    "           *nonceOut = m[4];                                              "
+    "                                   \n"
+    "           return;                                                        "
+    "                                   \n"
+    "       }                                                                  "
+    "                                   \n"
+    "   }\n";
+// One sub-test per entry of intensities[].
+OCLPerfGenoilSiaMiner::OCLPerfGenoilSiaMiner() { _numSubTests = NUM_INTENSITY; }
+// Empty: the OpenCL objects created in open() are released by close().
+OCLPerfGenoilSiaMiner::~OCLPerfGenoilSiaMiner() {}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; passed to clCreateContext() in open()
+
+void OCLPerfGenoilSiaMiner::setHeader(uint32_t *ptr) {
+  // Fixed 80-byte block header: a leading 0x10, eight zero words, then
+  // eleven constant words. `ptr` must have room for 20 uint32_t values.
+  static const uint32_t kTail[11] = {
+      0x4a5e1e4b, 0xaab89f3a, 0x32518a88, 0xc31bc87f, 0x618f7667, 0x3e2cc77a,
+      0xb2127b7a, 0xfdeda33b, 0x495fab29, 0x1d00ffff, 0x7c2bac1d};
+
+  ptr[0] = 0x10;
+  unsigned int word = 1;
+  while (word < 9) {
+    ptr[word++] = 0;
+  }
+
+  for (unsigned int k = 0; k < 11; ++k) {
+    ptr[word + k] = kTail[k];
+  }
+}
+
+// Selects the platform/device, builds nonceGrind and binds its two buffers.
+void OCLPerfGenoilSiaMiner::open(unsigned int test, char *units,
+                                 double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  // Reset every handle close() tests, so an early return leaves none stale.
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  blockHeadermobj_ = 0;
+  nonceOutmobj_ = 0;
+  // Parse args.
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ != CL_SUCCESS) delete[] platforms;  // CHECK_RESULT returns next
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // NOTE(review): _platformIndex is not range-checked against numPlatforms.
+    platform = platforms[_platformIndex];
+    char pbuf[100] = "";  // empty vendor string if the query below fails
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose platform with GPU devices
+    if (num_devices > 0) {
+      if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+    delete[] platforms;  // allocated with new[]
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // "OpenCL <major>.<minor> ..." keeps the "<major>.<minor>" text at 7..9.
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Copy the chosen id out and free the list before any check can return.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  // Make sure the device can handle our local item size.
+  size_t max_group_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                                     sizeof(size_t), &max_group_size, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (local_item_size > max_group_size) {
+    char buf[256];
+    SNPRINTF(buf, sizeof(buf),
+             "Selected device cannot handle work groups larger than %zu.\n",
+             max_group_size);
+    local_item_size = max_group_size;
+    testDescString = buf;
+  }
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Create Buffer Objects.
+  blockHeadermobj_ = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_ONLY, 80 * sizeof(uint8_t), NULL, &error_);
+  CHECK_RESULT(blockHeadermobj_ == 0, "clCreateBuffer(blockHeader) failed");
+  nonceOutmobj_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           8 * sizeof(uint8_t), NULL, &error_);
+  CHECK_RESULT(nonceOutmobj_ == 0, "clCreateBuffer(nonceOut) failed");
+
+  // Create kernel program from source file.
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&siaKernel, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, NULL, NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // stays empty if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+    // The condition must be true for CHECK_RESULT to record the failure.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  // Create data parallel OpenCL kernel.
+  kernel_ = _wrapper->clCreateKernel(program_, "nonceGrind", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  // Set OpenCL kernel arguments.
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void *)&blockHeadermobj_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem),
+                                    (void *)&nonceOutmobj_);
+}
+
+// Runs cycles_per_iter kernel launches at this sub-test's intensity (MH/s).
+void OCLPerfGenoilSiaMiner::run(void) {
+  CPerfCounter timer;
+
+  // Word-typed storage: setHeader() writes 20 uint32_t values (80 bytes), so
+  // the array is declared with that type instead of casting a byte array.
+  uint32_t blockHeader[20];
+  uint8_t nonceOut[8] = {0};
+
+  setHeader(blockHeader);
+  intensity = intensities[_openTest % NUM_INTENSITY];
+  size_t global_item_size = 1ULL << intensity;
+
+  timer.Reset();
+  timer.Start();
+
+  // By doing a bunch of low intensity calls, we prevent freezing
+  // By splitting them up inside this function, we also avoid calling
+  // get_block_for_work too often.
+  // Every CL call goes through _wrapper, like the other calls in this file.
+  for (unsigned int i = 0; i < cycles_per_iter; i++) {
+    // Offset global ids so that each loop call tries a different set of
+    // hashes.
+    size_t globalid_offset = i * global_item_size;
+
+    // Copy input data to the memory buffer.
+    error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, blockHeadermobj_,
+                                            CL_TRUE, 0, sizeof(blockHeader),
+                                            blockHeader, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+
+    error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, nonceOutmobj_, CL_TRUE,
+                                            0, sizeof(nonceOut), nonceOut, 0,
+                                            NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+
+    // Run the kernel.
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, &globalid_offset, &global_item_size,
+        &local_item_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+
+    // Copy result to host and see if a block was found.
+    // A reported nonce is not acted on: every cycle always runs.
+    error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, nonceOutmobj_, CL_TRUE,
+                                           0, sizeof(nonceOut), nonceOut, 0,
+                                           NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Hash rate calculation MH/s
+  double hash_rate = cycles_per_iter * global_item_size / (sec * 1000000);
+
+  _perfInfo = (float)hash_rate;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf),
+           " (%4d cycles) Work_items:%10zu Intensity:%d (MH/s) ",
+           cycles_per_iter, global_item_size, intensity);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfGenoilSiaMiner::close(void) {  // frees what open() created
+  if (blockHeadermobj_) {
+    error_ = _wrapper->clReleaseMemObject(blockHeadermobj_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(blockHeadermobj_) failed");
+  }
+  if (nonceOutmobj_) {
+    error_ = _wrapper->clReleaseMemObject(nonceOutmobj_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(nonceOutmobj_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {  // the queue and context below were created by open()
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // Returns _crcword directly; the base-class close() is not invoked here.
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h
new file mode 100644
index 0000000000..0d2f77b454
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GenoilSiaMiner_H_
+#define _OCL_GenoilSiaMiner_H_
+
+#include "OCLTestImp.h"
+
+// Perf test reporting the hash rate (MH/s) of the Sia miner kernel.
+class OCLPerfGenoilSiaMiner : public OCLTestImp {
+ public:
+  OCLPerfGenoilSiaMiner();
+  virtual ~OCLPerfGenoilSiaMiner();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;
+  // 2^intensity hashes are calculated each time the kernel is called
+  // Minimum of 2^8 (256) because our default local_item_size is 256
+  // global_item_size (2^intensity) must be a multiple of local_item_size
+  // Max of 2^32 so that people can't send an hour of work to the GPU at one
+  // time
+#define MIN_INTENSITY 8
+#define MAX_INTENSITY 32
+#define DEFAULT_INTENSITY 16
+
+  // Number of times the GPU kernel is called between updating the command line
+  // text
+#define MIN_CPI 1  // Must do one call per update
+#define MAX_CPI 65536  // 2^16 is a slightly arbitrary max
+#define DEFAULT_CPI 30
+
+  // The maximum size of the .cl file we read in and compile
+#define MAX_SOURCE_SIZE (0x200000)
+
+  // OpenCL handles start out null so that close(), which releases every
+  // non-null handle, never sees an indeterminate value.
+  cl_context context_ = NULL;
+  cl_command_queue cmd_queue_ = NULL;
+  cl_int error_ = CL_SUCCESS;
+  cl_program program_ = NULL;
+  cl_kernel kernel_ = NULL;
+  // mem objects for storing our kernel parameters
+  cl_mem blockHeadermobj_ = NULL;
+  cl_mem nonceOutmobj_ = NULL;
+
+  // More global variables that grindNonce needs to access
+  size_t local_item_size = 256;  // local work-group size; usually optimal
+  unsigned int blocks_mined = 0;
+  unsigned int intensity = DEFAULT_INTENSITY;
+  unsigned cycles_per_iter = DEFAULT_CPI;
+
+  bool isAMD = false;
+  char platformVersion[32] = {0};
+  void setHeader(uint32_t* ptr);
+};
+
+#endif  // _OCL_GenoilSiaMiner_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.cpp
new file mode 100644
index 0000000000..f8f3280441
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.cpp
@@ -0,0 +1,367 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageCopyCorners.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 2  // 0: Sizes0 x max height, 1: max width x Sizes1
+static const unsigned int Sizes0[NUM_SIZES] = {512, 16384};
+static const unsigned int Sizes1[NUM_SIZES] = {16384, 512};
+
+#define NUM_FORMATS 3  // formats[], textFormats[], formatSize[] are parallel
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8},
+    {CL_R, CL_UNSIGNED_INT32},
+    {CL_RGBA, CL_UNSIGNED_INT32}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8", "R32",
+                                               "R32G32B32A32"};
+static const unsigned int formatSize[NUM_FORMATS] = {  // bytes per pixel
+    4 * sizeof(cl_uchar), 1 * sizeof(cl_uint), 4 * sizeof(cl_uint)};
+
+static const unsigned int Iterations[2] = {1,  // copies per timed run
+                                           OCLPerfImageCopyCorners::NUM_ITER};
+
+#define NUM_SUBTESTS 3  // img->buf, buf->img, img->img
+OCLPerfImageCopyCorners::OCLPerfImageCopyCorners() {
+  _numSubTests = 2 * NUM_FORMATS * NUM_SUBTESTS * NUM_SIZES;  // x2: Iterations
+}
+
+OCLPerfImageCopyCorners::~OCLPerfImageCopyCorners() {}  // cleanup: close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op notifier
+
+void OCLPerfImageCopyCorners::setData(void *ptr, unsigned int pitch,
+                                      unsigned int size) {
+  // Writes the ramp 0, 1, 2, ... into the first size / 4 32-bit words of
+  // `ptr`. `pitch` is accepted but not used; bytes past the last whole word
+  // are left untouched.
+  unsigned int *words = static_cast<unsigned int *>(ptr);
+  const unsigned int wordCount = size >> 2;
+  for (unsigned int w = 0; w < wordCount; ++w) words[w] = w;
+}
+
+void OCLPerfImageCopyCorners::checkData(void *ptr, unsigned int pitch,
+                                        unsigned int size) {
+  // Verifies the ramp 0, 1, 2, ... written by setData() over the first
+  // size / 4 words (`pitch` is unused) and reports the first mismatch through
+  // CHECK_RESULT. Only the offending word is printed: dumping the three words
+  // after it could read past the end of the mapping.
+  const unsigned int *words = static_cast<const unsigned int *>(ptr);
+  for (unsigned int i = 0; i < size >> 2; i++) {
+    if (words[i] != i) {
+      printf("Data validation failed at %u!  Got 0x%08x\n", i, words[i]);
+      printf("Expected 0x%08x\n", i);
+      CHECK_RESULT(true, "Data validation failed!");
+      break;
+    }
+  }
+}
+
+// Prepares sub-test `test` on device `deviceId`. The index is decoded into an
+// image format, a copy direction (which of source/destination are images), a
+// shape (one side is the device's 2D image limit) and an iteration count;
+// then the context, queue and both memory objects are created and filled.
+void OCLPerfImageCopyCorners::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  size_t queryOut = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  srcImage_ = false;
+  dstImage_ = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Take the platform only if the index is in range, then free the list
+    // before CHECK_RESULT, which may leave this function early.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  if (platform != NULL) {
+    // Count the requested devices. The status is deliberately not checked:
+    // the runtime returns an error, not 0 devices, when no GPU is present.
+    error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                      &num_devices);
+  }
+
+  bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % NUM_FORMATS;
+
+  if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 1) {
+    srcImage_ = true;
+  }
+  if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 2) {
+    dstImage_ = true;
+  }
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Copy out the one handle we need and free the list before the checks
+  // below, which may return early.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  if (_openTest % NUM_SIZES) {
+    error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                                       sizeof(size_t), &queryOut, NULL);
+    bufSizeW_ = (cl_uint)queryOut;
+    bufSizeH_ = Sizes1[_openTest % NUM_SIZES];
+  } else {
+    error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                                       sizeof(size_t), &queryOut, NULL);
+    bufSizeW_ = Sizes0[_openTest % NUM_SIZES];
+    bufSizeH_ = (cl_uint)queryOut;
+  }
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // NOTE(review): destination and source are both seeded with the same ramp,
+  // so checkData() in run() cannot tell a copy that did nothing from a good one.
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  void *mem;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+  if (dstImage_) {
+    dstBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, NULL, &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSizeH_;
+  } else {
+    dstBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize);
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+
+  flags = CL_MEM_READ_ONLY;
+  if (srcImage_) {
+    srcBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, NULL, &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSizeH_;
+  } else {
+    srcBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize);
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
+
+// Runs one untimed warm-up copy and then numIter timed copies from srcBuffer_
+// to dstBuffer_, validates the destination contents, and publishes the
+// bandwidth in _perfInfo and a description in testDescString.
+void OCLPerfImageCopyCorners::run(void) {
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+
+  // Warm up
+  if (srcImage_ && dstImage_) {
+    error_ =
+        _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin,
+                                     origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+  } else if (srcImage_) {
+    error_ = _wrapper->clEnqueueCopyImageToBuffer(
+        cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+  } else {
+    error_ = _wrapper->clEnqueueCopyBufferToImage(
+        cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed section: enqueue every copy, then wait for the queue to drain.
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int iter = 0; iter < numIter; ++iter) {
+    if (srcImage_ && dstImage_) {
+      error_ =
+          _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_,
+                                       origin, origin, region, 0, NULL, NULL);
+      CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+    } else if (srcImage_) {
+      error_ = _wrapper->clEnqueueCopyImageToBuffer(
+          cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+      CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+    } else {
+      error_ = _wrapper->clEnqueueCopyBufferToImage(
+          cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+      CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+    }
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  stopwatch.Stop();
+  double sec = stopwatch.GetElapsedTime();
+
+  // Image copy bandwidth in GB/s. The factor 2 presumably counts each byte
+  // once as a read and once as a write.
+  const double gbytes = (double)bufSizeW_ * bufSizeH_ * formatSize[bufnum_] *
+                        2 * numIter * (double)(1e-09);
+  double perf = gbytes / sec;
+
+  const char *strSrc = srcImage_ ? "img" : "buf";
+  const char *strDst = dstImage_ ? "img" : "buf";
+
+  // Map the destination for reading and validate what the copies produced.
+  void *mem;
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+  if (dstImage_) {
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSizeH_;
+  } else {
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, 0,
+        bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  checkData(mem, (unsigned int)image_row_pitch, memSize);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ",
+           bufSizeW_, bufSizeH_, textFormats[bufnum_], strSrc, strDst, numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfImageCopyCorners::close(void) {
+  // Drain pending work; skipped if open() failed before creating the queue.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (srcBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h
new file mode 100644
index 0000000000..7d761c8e13
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageCopyCorners_H_
+#define _OCL_ImageCopyCorners_H_
+
+#include "OCLTestImp.h"
+
+// Perf test: copy bandwidth between buffers and 2D images whose width or
+// height is the device maximum. The data members are set up by open().
+class OCLPerfImageCopyCorners : public OCLTestImp {
+ public:
+  OCLPerfImageCopyCorners();
+  virtual ~OCLPerfImageCopyCorners();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+  static const unsigned int NUM_ITER = 10;  // copies in the long sub-tests
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;  // copy source: a buffer, or an image if srcImage_
+  cl_mem dstBuffer_;  // copy destination: a buffer, or an image if dstImage_
+  cl_int error_;
+
+  unsigned int bufSizeW_;
+  unsigned int bufSizeH_;
+  unsigned int bufnum_;  // index into the format tables
+  bool srcImage_;
+  bool dstImage_;
+  unsigned int numIter;
+  void setData(void* ptr, unsigned int pitch, unsigned int size);
+  void checkData(void* ptr, unsigned int pitch, unsigned int size);
+};
+
+#endif  // _OCL_ImageCopyCorners_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp
new file mode 100644
index 0000000000..5d62de9dad
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp
@@ -0,0 +1,344 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageCopySpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4  // square edge lengths (width == height)
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+
+#define NUM_FORMATS 1  // formats[], textFormats[], formatSize[] are parallel
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4 * sizeof(cl_uchar)};
+
+static const unsigned int Iterations[2] = {1, OCLPerfImageCopySpeed::NUM_ITER};
+
+#define NUM_SUBTESTS 3  // img->buf, buf->img, img->img
+OCLPerfImageCopySpeed::OCLPerfImageCopySpeed() {
+  _numSubTests = 2 * NUM_FORMATS * NUM_SUBTESTS * NUM_SIZES;  // x2: Iterations
+}
+
+OCLPerfImageCopySpeed::~OCLPerfImageCopySpeed() {}  // cleanup: close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op notifier
+
+void OCLPerfImageCopySpeed::setData(void *ptr, unsigned int pitch,
+                                    unsigned int size, unsigned int value) {
+  // Stores `value` in the first size / 4 32-bit words; `pitch` is unused.
+  unsigned int *words = static_cast<unsigned int *>(ptr);
+  const unsigned int wordCount = size >> 2;
+  for (unsigned int w = 0; w < wordCount; ++w) words[w] = value;
+}
+
+void OCLPerfImageCopySpeed::checkData(void *ptr, unsigned int pitch,
+                                      unsigned int size, unsigned int value) {
+  // Prints the first of the size / 4 words that differs from `value`; the
+  // words after it are not dumped because they may lie beyond the mapping.
+  // NOTE(review): a mismatch is only printed, never CHECK_RESULT'ed; intended?
+  const unsigned int *words = static_cast<const unsigned int *>(ptr);
+  for (unsigned int i = 0; i < size >> 2; i++) {
+    if (words[i] == value) continue;
+    printf("Data validation failed at %u!  Got 0x%08x\n", i, words[i]);
+    printf("Expected 0x%08x\n", value);
+    break;
+  }
+}
+
+// Prepares sub-test `test` on device `deviceId`. The index is decoded into a
+// square size, an image format, a copy direction (which of source and
+// destination are images) and an iteration count; then the context, queue
+// and both memory objects are created and filled with marker patterns.
+void OCLPerfImageCopySpeed::open(unsigned int test, char *units,
+                                 double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  srcImage_ = false;
+  dstImage_ = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Take the platform only if the index is in range, then free the list
+    // before CHECK_RESULT, which may leave this function early.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  if (platform != NULL) {
+    // Count the requested devices. The status is deliberately not checked:
+    // the runtime returns an error, not 0 devices, when no GPU is present.
+    error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                      &num_devices);
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % NUM_FORMATS;
+
+  if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 1) {
+    srcImage_ = true;
+  }
+  if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 2) {
+    dstImage_ = true;
+  }
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Copy out the one handle we need and free the list before the checks
+  // below, which may return early.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // The destination is seeded with 0xdeadbeef, which differs from the source
+  // pattern, so run() can detect a copy that never wrote it.
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  void *mem;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+  if (dstImage_) {
+    dstBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSize_,
+                                  bufSize_, 0, NULL, &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSize_;
+  } else {
+    dstBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSize_ * bufSize_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize, 0xdeadbeef);
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+
+  // The source carries 0x600df00d, the pattern run() passes to checkData().
+  flags = CL_MEM_READ_ONLY;
+  if (srcImage_) {
+    srcBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSize_,
+                                  bufSize_, 0, NULL, &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSize_;
+  } else {
+    srcBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSize_ * bufSize_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d);
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
+
+// Runs one untimed warm-up copy and then numIter timed copies from srcBuffer_
+// to dstBuffer_, checks the destination for the source pattern, and publishes
+// the bandwidth in _perfInfo and a description in testDescString.
+void OCLPerfImageCopySpeed::run(void) {
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+
+  // Warm up
+  if (srcImage_ && dstImage_) {
+    error_ =
+        _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin,
+                                     origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+  } else if (srcImage_) {
+    error_ = _wrapper->clEnqueueCopyImageToBuffer(
+        cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+  } else {
+    error_ = _wrapper->clEnqueueCopyBufferToImage(
+        cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed section: enqueue every copy, then wait for the queue to drain.
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int iter = 0; iter < numIter; ++iter) {
+    if (srcImage_ && dstImage_) {
+      error_ =
+          _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_,
+                                       origin, origin, region, 0, NULL, NULL);
+      CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+    } else if (srcImage_) {
+      error_ = _wrapper->clEnqueueCopyImageToBuffer(
+          cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+      CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+    } else {
+      error_ = _wrapper->clEnqueueCopyBufferToImage(
+          cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+      CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+    }
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  stopwatch.Stop();
+  double sec = stopwatch.GetElapsedTime();
+
+  // Image copy bandwidth in GB/s. The factor 2 presumably counts each byte
+  // once as a read and once as a write.
+  const double gbytes = (double)bufSize_ * bufSize_ * formatSize[bufnum_] * 2 *
+                        numIter * (double)(1e-09);
+  double perf = gbytes / sec;
+
+  const char *strSrc = srcImage_ ? "img" : "buf";
+  const char *strDst = dstImage_ ? "img" : "buf";
+
+  // Map the destination for reading and look for the source pattern in it.
+  void *mem;
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+  if (dstImage_) {
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSize_;
+  } else {
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, 0,
+        bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSize_ * bufSize_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  checkData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ",
+           bufSize_, bufSize_, textFormats[bufnum_], strSrc, strDst, numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfImageCopySpeed::close(void) {
+  // Drain pending work; skipped if open() failed before creating the queue.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (srcBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h
new file mode 100644
index 0000000000..570ab9511e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageCopySpeed_H_
+#define _OCL_ImageCopySpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageCopySpeed : public OCLTestImp {
+ public:
+  OCLPerfImageCopySpeed();
+  virtual ~OCLPerfImageCopySpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // releases the CL handles below
+
+  static const unsigned int NUM_ITER = 1000;
+  // CL objects; close() releases each one that is non-null.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;
+  cl_mem dstBuffer_;  // mapped and validated at the end of run()
+  cl_int error_;      // status of the most recent CL call
+  // Sub-test parameters.
+  unsigned int bufSize_;  // image edge length; used as both width and height
+  unsigned int bufnum_;   // index into the format tables of the .cpp file
+  bool srcImage_;         // presumably: source is an image, not a buffer
+  bool dstImage_;         // presumably: destination is an image, not a buffer
+  unsigned int numIter;   // printed as "i:" in the test description
+  void setData(void* ptr, unsigned int pitch, unsigned int size,
+               unsigned int value);
+  void checkData(void* ptr, unsigned int pitch, unsigned int size,
+                 unsigned int value);
+};
+
+#endif  // _OCL_ImageCopySpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp
new file mode 100644
index 0000000000..7502b65aa0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp
@@ -0,0 +1,194 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageCreate.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Bounded string formatter: sprintf_s on Windows builds, snprintf elsewhere.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4  // square image edge lengths, in pixels
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+
+#if defined(CL_VERSION_2_0)
+#define NUM_FORMATS 3
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8},
+    {CL_sRGBA, CL_UNORM_INT8},
+    {CL_DEPTH, CL_UNORM_INT16}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8",
+                                               "CL_sRGBA, CL_UNORM_INT8   ",
+                                               "CL_DEPTH, CL_UNORM_INT16  "};
+// Bytes per pixel. sizeof() of a CL_* channel-type macro is only sizeof(int).
+static const unsigned int formatSize[NUM_FORMATS] = {4, 4, 2};
+#else
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA, CL_UNSIGNED_INT8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};  // bytes per pixel
+#endif
+
+OCLPerfImageCreate::OCLPerfImageCreate() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS;  // one sub-test per size/format pair
+}
+
+OCLPerfImageCreate::~OCLPerfImageCreate() {}  // cleanup is done in close()
+// No-op CL notification callback; appears unused in this file (TODO confirm).
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerfImageCreate::setData(void *ptr, unsigned int size,
+                                 unsigned int value) {
+  // Fills the buffer with consecutive 32-bit words starting at `value`; any
+  // trailing bytes beyond a multiple of four are left untouched.
+  unsigned int *words = static_cast<unsigned int *>(ptr);
+  const unsigned int wordCount = size / 4;
+  for (unsigned int w = 0; w != wordCount; ++w) words[w] = value++;
+}
+
+void OCLPerfImageCreate::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  // Selects the image size/format for sub-test `test` and allocates the host
+  // staging memory; flags the test as skipped on pre-2.0 devices.
+  error_ = CL_SUCCESS;
+  testId_ = test;
+  memptr = 0;  // close() inspects memptr/outBuffer_: reset before early exits
+  outBuffer_ = 0;
+  skip_ = false;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char *strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Version reads "OpenCL <major>.<minor> ...", so [7] is the major digit.
+  const bool preCL20 =
+      error_ != CL_SUCCESS || param_size <= 7 || strVersion[7] < '2';
+  delete[] strVersion;  // new[] pairs with delete[]; freed before any exit
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    skip_ = true;
+    testDescString =
+        "sRGBA Image not supported for < 2.0 devices. Test Skipped.";
+    return;
+  }
+
+  bufSize_ = Sizes[test % NUM_SIZES];
+  bufnum_ = (test / NUM_SIZES) % NUM_FORMATS;
+  memSize = bufSize_ * bufSize_ * formatSize[bufnum_];
+  numIter = 100;
+  // calloc: close() releases only non-null slots, so unused ones must be 0.
+  outBuffer_ = (cl_mem *)calloc(numIter, sizeof(cl_mem));
+  memptr = new char[memSize];
+  cmd_queue_ = cmdQueues_[_deviceId];
+}
+
+void OCLPerfImageCreate::run(void) {
+  // Times numIter clCreateImage() calls over the host buffer, each followed by
+  // a blocking one-pixel read, and reports the result in GB/s.
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+  cl_image_desc imageInfo;
+  memset(&imageInfo, 0x0, sizeof(cl_image_desc));
+
+  imageInfo.image_type = CL_MEM_OBJECT_IMAGE2D;
+  imageInfo.image_width = bufSize_;
+  imageInfo.image_height = bufSize_;
+  imageInfo.image_depth = 1;
+  imageInfo.image_array_size = 1;
+  imageInfo.image_row_pitch = bufSize_ * formatSize[bufnum_];
+  imageInfo.image_slice_pitch = imageInfo.image_row_pitch * (bufSize_);
+
+  setData(memptr, memSize, 0xdeadbeef);
+
+  // Only one pixel is read back (region 1x1x1), so a small stack buffer is
+  // enough and, unlike a heap block, cannot leak if a check below exits early.
+  char dstmem[16] = {0};
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {1, 1, 1};
+
+  timer.Reset();
+  timer.Start();
+
+  for (unsigned int i = 0; i < numIter; ++i) {
+    outBuffer_[i] =
+        clCreateImage(context_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                      &formats[bufnum_], &imageInfo, memptr, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "Error clCreateImage()");
+
+    error_ =
+        _wrapper->clEnqueueReadImage(cmd_queue_, outBuffer_[i], CL_TRUE, origin,
+                                     region, 0, 0, dstmem, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueReadImage failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  // Image create in GB/s
+  double perf = ((double)memSize * numIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS;
+  SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s(%1d) i: %4d (GB/s) ", bufSize_,
+           bufSize_, textFormats[fmt_num], formatSize[bufnum_], numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfImageCreate::close(void) {
+  // Frees the staging buffer, every image run() created, and the handle table.
+  delete[] memptr;  // array form matches new char[]; deleting null is a no-op
+  memptr = 0;
+  for (unsigned int i = 0; outBuffer_ && i < numIter; ++i) {
+    if (!outBuffer_[i]) continue;  // skip empty slots
+    error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_[i]) failed");
+  }
+  free(outBuffer_);  // the table itself was leaked before; free(NULL) is fine
+  outBuffer_ = 0;
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h
new file mode 100644
index 0000000000..5d717a5d12
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageCreate_H_
+#define _OCL_ImageCreate_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageCreate : public OCLTestImp {
+ public:
+  OCLPerfImageCreate();
+  virtual ~OCLPerfImageCreate();
+
+ public:
+  virtual void open(unsigned int test, char *units, double &conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  virtual void setData(void *ptr, unsigned int size, unsigned int value);
+
+  cl_command_queue cmd_queue_;  // set from cmdQueues_[_deviceId] in open()
+  cl_mem *outBuffer_;           // table of numIter image handles made by run()
+
+  unsigned int bufSize_;  // image edge length; used as both width and height
+  unsigned int bufnum_;   // index into formats / textFormats / formatSize
+  unsigned int numIter;   // images created per run(); open() sets it to 100
+  char *memptr;           // host memory the images are created over
+  unsigned int memSize;   // bufSize_ * bufSize_ * formatSize[bufnum_] bytes
+  unsigned int testId_;   // sub-test index passed to open()
+
+  bool skip_;  // set by open() when the device version is below 2.0
+};
+
+#endif  // _OCL_ImageCreate_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp
new file mode 100644
index 0000000000..926d5b3f65
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp
@@ -0,0 +1,333 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageMapUnmap.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Bounded string formatter: sprintf_s on Windows builds, snprintf elsewhere.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+static const unsigned int Sizes0[2] = {0xc0, 0x18a};  // {width, height}: 192x394
+// Single format: one signed 16-bit channel; formatSize is bytes per pixel.
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {{CL_R, CL_SNORM_INT16}};
+static const char *textFormats[NUM_FORMATS] = {"R16"};
+static const unsigned int formatSize[NUM_FORMATS] = {2 * sizeof(cl_uchar)};
+// Indexed in open() by _openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS).
+static const unsigned int Iterations[2] = {1, OCLPerfImageMapUnmap::NUM_ITER};
+
+#define NUM_SUBTESTS 1
+OCLPerfImageMapUnmap::OCLPerfImageMapUnmap() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS * 1;  // evaluates to 1
+}
+
+OCLPerfImageMapUnmap::~OCLPerfImageMapUnmap() {}  // cleanup is done in close()
+// No-op notification callback handed to clCreateContext() in open().
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerfImageMapUnmap::setData(void *ptr, unsigned int pitch,
+                                   unsigned int size, unsigned int value) {
+  // Writes the ramp 0, 1, 2, ... into the first size/4 32-bit words of ptr.
+  // `pitch` and the incoming `value` are ignored: the ramp always starts at 0.
+  unsigned int *words = static_cast<unsigned int *>(ptr);
+  for (value = 0; value < size / 4; ++value) {
+    words[value] = value;
+  }
+}
+
+void OCLPerfImageMapUnmap::checkData(void *ptr, unsigned int pitch,
+                                     unsigned int size, unsigned int value) {
+  // Verifies the 0, 1, 2, ... word ramp that setData() writes; `pitch` and the
+  // incoming `value` are ignored. Stops at the first mismatch.
+  const unsigned int *words = (const unsigned int *)ptr;
+  value = 0;
+  for (unsigned int i = 0; i < size >> 2; i++, value++) {
+    if (words[i] == value) continue;
+    // Print only the offending word: the old four-word dump read past the end
+    // of the mapping near the tail and repeated one expected value four times.
+    printf("Data validation failed at %u!  Got 0x%08x\n", i, words[i]);
+    printf("Expected 0x%08x\n", value);
+    CHECK_RESULT(true, "Data validation failed!");
+    break;
+  }
+}
+
+void OCLPerfImageMapUnmap::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  // Creates a private context and queue on the requested device, then the
+  // destination buffer and the host-pointer-backed source image, filling both
+  // through map/unmap. `units` is unused; `conversion` is set to 1.0.
+  // NOTE(review): CHECK_RESULT presumably returns on failure (confirm in
+  // OCLTestImp.h), so temporaries are released before the checks that follow
+  // their allocation.
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  srcImage_ = false;
+  dstImage_ = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Keep this status: the queries below overwrite error_, and the list has
+    // to be freed before it is checked.
+    const cl_int listStatus = error_;
+    // An out-of-range _platformIndex leaves `platform` null; that is reported
+    // by the platform check further down instead of indexing past the array.
+    if (listStatus == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                        &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so this status is deliberately not checked.
+    }
+    delete[] platforms;  // array form: the list was allocated with new[]
+    CHECK_RESULT(listStatus != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+
+  bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % NUM_FORMATS;
+
+  srcImage_ = true;
+
+  dstImage_ = false;
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Pick the device and free the temporary list before the checks below, so
+  // the list is released on the success and the failure paths alike.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  bufSizeW_ = Sizes0[0];
+  bufSizeH_ = Sizes0[1];
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  cl_mem_flags flags2 = CL_MEM_WRITE_ONLY;
+  void *mem;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  cl_image_desc imageInfo;
+
+  memset(&imageInfo, 0x0, sizeof(cl_image_desc));
+
+  imageInfo.image_type = CL_MEM_OBJECT_IMAGE2D;
+  imageInfo.image_width = bufSizeW_;
+  imageInfo.image_height = bufSizeH_;
+  imageInfo.image_depth = 1;
+  imageInfo.image_array_size = 1;
+  imageInfo.image_row_pitch = bufSizeW_ * formatSize[bufnum_];
+  imageInfo.image_slice_pitch = imageInfo.image_row_pitch * (bufSizeH_);
+
+  void *host_ptr = malloc(imageInfo.image_row_pitch * imageInfo.image_height);
+  // NOTE: host_ptr backs srcBuffer_ (USE_HOST_PTR) and is never freed.
+  unsigned int memSize;
+  if (dstImage_) {
+    dstBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, host_ptr, &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSizeH_;
+  } else {
+    dstBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags2, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize, 0xdeadbeef);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);
+
+  flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
+  if (srcImage_) {
+    srcBuffer_ = _wrapper->clCreateImage(context_, flags, &formats[bufnum_],
+                                         &imageInfo, host_ptr, &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * bufSizeH_;
+    error_ = _wrapper->clFinish(cmd_queue_);
+  } else {
+    srcBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL);
+  error_ = _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfImageMapUnmap::run(void) {
+  // Performs one copy from srcBuffer_ to dstBuffer_ (the CL call depends on
+  // which side is an image), then maps the destination and validates it.
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+  const unsigned int linearBytes = bufSizeW_ * bufSizeH_ * formatSize[bufnum_];
+
+  if (!srcImage_) {
+    error_ = _wrapper->clEnqueueCopyBufferToImage(
+        cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+  } else if (!dstImage_) {
+    error_ = _wrapper->clEnqueueCopyImageToBuffer(
+        cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+  } else {
+    error_ =
+        _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin,
+                                     origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Blocking read-map of the destination; the buffer path passes pitch 0.
+  void *mapped;
+  size_t rowPitch = 0;
+  size_t slicePitch;
+  unsigned int mappedBytes;
+  if (dstImage_) {
+    mapped = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region, &rowPitch,
+        &slicePitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    mappedBytes = (unsigned int)rowPitch * bufSizeH_;
+  } else {
+    mapped = _wrapper->clEnqueueMapBuffer(cmd_queue_, dstBuffer_, CL_TRUE,
+                                          CL_MAP_READ, 0, linearBytes, 0, NULL,
+                                          NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    mappedBytes = linearBytes;
+  }
+  checkData(mapped, (unsigned int)rowPitch, mappedBytes, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mapped, 0, NULL,
+                                    NULL);
+  // Nothing is timed in this test, so the reported figure is zero.
+  _perfInfo = 0;
+
+  // Describe the sub-test: dimensions, format, endpoint kinds, iterations.
+  const char *strSrc = srcImage_ ? "img" : "buf";
+  const char *strDst = dstImage_ ? "img" : "buf";
+  char text[256];
+  SNPRINTF(text, sizeof(text),
+           " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ", bufSizeW_,
+           bufSizeH_, textFormats[bufnum_], strSrc, strDst, numIter);
+  testDescString = text;
+}
+
+unsigned int OCLPerfImageMapUnmap::close(void) {
+  // Drain pending work, then release whichever CL objects exist. open() can
+  // exit before the queue is created, so clFinish is guarded like the rest.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (srcBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  return _crcword;  // open() zeroes it; nothing here modifies it
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h
new file mode 100644
index 0000000000..9f061581de
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageMapUnmap_H_
+#define _OCL_ImageMapUnmap_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageMapUnmap : public OCLTestImp {
+ public:
+  OCLPerfImageMapUnmap();
+  virtual ~OCLPerfImageMapUnmap();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // releases the CL handles below
+
+  static const unsigned int NUM_ITER = 1;  // second entry of Iterations[]
+  // CL objects created in open(); close() releases each non-null one.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;  // source of the copy in run(); an image when srcImage_
+  cl_mem dstBuffer_;  // destination of the copy; a buffer unless dstImage_
+  cl_int error_;      // status of the most recent CL call
+  // Sub-test parameters chosen in open().
+  unsigned int bufSizeW_;  // image width in pixels (Sizes0[0])
+  unsigned int bufSizeH_;  // image height in pixels (Sizes0[1])
+  unsigned int bufnum_;    // index into formats / textFormats / formatSize
+  bool srcImage_;          // open() sets this to true
+  bool dstImage_;          // open() sets this to false
+  unsigned int numIter;    // taken from Iterations[]; only printed by run()
+  void setData(void* ptr, unsigned int pitch, unsigned int size,
+               unsigned int value);
+  void checkData(void* ptr, unsigned int pitch, unsigned int size,
+                 unsigned int value);
+};
+
+#endif  // _OCL_ImageMapUnmap_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp
new file mode 100644
index 0000000000..7f87c24515
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp
@@ -0,0 +1,295 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4  // Sizes[] entries serve as both image width and height
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+// Per-format tables indexed by bufnum_: CL format, label, bytes per pixel.
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};
+// Timed read counts; open() picks one via test / (NUM_SIZES * NUM_FORMATS).
+static const unsigned int Iterations[2] = {1, OCLPerfImageReadSpeed::NUM_ITER};
+
+OCLPerfImageReadSpeed::OCLPerfImageReadSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS * 2;  // x2: one per Iterations[] entry
+}
+
+OCLPerfImageReadSpeed::~OCLPerfImageReadSpeed() {}  // cleanup is in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // passed to clCreateContext; ignores all reports
+
+void OCLPerfImageReadSpeed::open(unsigned int test, char *units,
+                                 double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  // Sets up one subtest: device lookup, then context, queue, image, host buffer.
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  memptr = NULL;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice,
+                                      0, NULL, &num_devices);
+    delete[] platforms;  // allocated with new[]
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)];
+
+  CHECK_RESULT(platform == 0, "Couldn't find platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);  // only the copied handle is needed; frees on every path
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+  memptr = new char[bufSize_ * bufSize_ * formatSize[bufnum_]];
+}
+
+void OCLPerfImageReadSpeed::run(void) {
+  // Times numIter blocking whole-image reads into memptr and reports GB/s.
+  size_t zeroOrigin[3] = {0, 0, 0};
+  size_t wholeImage[3] = {bufSize_, bufSize_, 1};
+  CPerfCounter stopwatch;
+
+  // Untimed warm-up read
+  error_ = _wrapper->clEnqueueReadImage(cmd_queue_, outBuffer_, CL_TRUE,
+                                        zeroOrigin, wholeImage, 0, 0, memptr, 0,
+                                        NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueReadImage failed");
+
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int remaining = numIter; remaining != 0; --remaining) {
+    error_ = _wrapper->clEnqueueReadImage(cmd_queue_, outBuffer_, CL_TRUE,
+                                          zeroOrigin, wholeImage, 0, 0, memptr,
+                                          0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueReadImage failed");
+  }
+  stopwatch.Stop();
+
+  // Bytes moved by all timed reads, scaled to GB and divided by seconds
+  const double elapsedSec = stopwatch.GetElapsedTime();
+  const double bytesRead =
+      (double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter;
+  const double gbPerSec = (bytesRead * (double)(1e-09)) / elapsedSec;
+  _perfInfo = (float)gbPerSec;
+
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), " (%4dx%4d) fmt:%s i: %4d (GB/s) ", bufSize_,
+           bufSize_, textFormats[bufnum_], numIter);
+  testDescString = desc;
+}
+
+unsigned int OCLPerfImageReadSpeed::close(void) {
+  // Frees the host buffer, then releases the image, queue and context.
+  delete[] memptr;  // from new char[]; delete[] of NULL is a no-op
+  memptr = NULL;
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+OCLPerfPinnedImageReadSpeed::OCLPerfPinnedImageReadSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS * 2;  // x2: one per Iterations[] entry
+}
+
+OCLPerfPinnedImageReadSpeed::~OCLPerfPinnedImageReadSpeed() {}  // cleanup is in close()
+
+void OCLPerfPinnedImageReadSpeed::open(unsigned int test, char *units,
+                                       double &conversion,
+                                       unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  // Subtest setup; memptr becomes a mapping of a CL_MEM_ALLOC_HOST_PTR buffer.
+  context_ = 0;
+  cmd_queue_ = 0;
+  inBuffer_ = 0;  // close() tests this handle even if open() bails out early
+  outBuffer_ = 0;
+  memptr = NULL;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice,
+                                      0, NULL, &num_devices);
+    delete[] platforms;  // allocated with new[]
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)];
+
+  CHECK_RESULT(platform == 0, "Couldn't find platform, cannot proceed");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);  // only the copied handle is needed; frees on every path
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+  inBuffer_ = _wrapper->clCreateBuffer(
+      context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL,
+      &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  flags = CL_MEM_WRITE_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+
+  memptr = (char *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+      bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+}
+
+unsigned int OCLPerfPinnedImageReadSpeed::close(void) {
+  if (memptr) {  // memptr is the mapping of inBuffer_ made in open()
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, memptr, 0,
+                                               NULL, NULL);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clEnqueueUnmapMemObject(inBuffer_) failed");
+    _wrapper->clFinish(cmd_queue_);  // go through the wrapper like other calls
+  }
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h
new file mode 100644
index 0000000000..e1d8498610
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageReadSpeed_H_
+#define _OCL_ImageReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfImageReadSpeed();
+  virtual ~OCLPerfImageReadSpeed();
+  // Times clEnqueueReadImage from a 2D image into host memory (see the .cpp).
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 100;  // second entry of Iterations[] in the .cpp
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // 2D image that run() reads from
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // image width and height, in pixels
+  unsigned int bufnum_;   // index into the format tables of the .cpp
+  unsigned int numIter;   // timed reads per run()
+  char* memptr;           // host destination of the reads
+};
+
+class OCLPerfPinnedImageReadSpeed : public OCLPerfImageReadSpeed {
+ public:
+  OCLPerfPinnedImageReadSpeed();
+  virtual ~OCLPerfPinnedImageReadSpeed();
+  // run() is inherited; here memptr is a mapping of inBuff_'s sibling inBuffer_.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual unsigned int close(void);
+
+  cl_mem inBuffer_;  // CL_MEM_ALLOC_HOST_PTR buffer mapped in open(), unmapped in close()
+};
+#endif  // _OCL_ImageReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp
new file mode 100644
index 0000000000..3a668554f7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageReadWrite.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+#define NUM_SIZES 4
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+// Format tables indexed by bufnum_; formatSize holds bytes per pixel.
+#if defined(CL_VERSION_2_0)
+#define NUM_FORMATS 2
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}, {CL_sRGBA, CL_UNORM_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8",
+                                               "CL_sRGBA, CL_UNORM_INT8   "};
+// Four 8-bit channels per pixel; sizeof() of the enum constant is unrelated.
+static const unsigned int formatSize[NUM_FORMATS] = {4, 4};
+#else
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};  // 4 x 8-bit channels
+#endif
+
+const static char *strKernel = {KERNEL_CODE(  // OpenCL C source, stringified
+  \n __constant sampler_t s_nearest = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE;
+  \n __kernel void image_kernel(read_write image2d_t image, uint zero) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+
+  int offset = y * get_image_width(image) + x;
+
+  int2 coords = (int2)(x, y);
+  uint4 tmp = read_imageui(image, s_nearest, coords);
+  // The host sets zero to 0, so each texel is read and then overwritten with 1.
+  write_imageui(image, coords, 1 + tmp * zero);
+}
+  \n)};
+
+OCLPerfImageReadWrite::OCLPerfImageReadWrite() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS;  // one subtest per size/format pair
+}
+
+OCLPerfImageReadWrite::~OCLPerfImageReadWrite() {}  // cleanup is in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty; not referenced elsewhere in this file
+
+void OCLPerfImageReadWrite::setData(void *ptr, unsigned int size,
+                                    unsigned int value) {
+  // Writes value, value + 1, ... into the first size >> 2 unsigned ints.
+  unsigned int *word = (unsigned int *)ptr;
+  for (unsigned int *end = word + (size >> 2); word != end; ++word) {
+    *word = value++;
+  }
+}
+
+void OCLPerfImageReadWrite::open(unsigned int test, char *units,
+                                 double &conversion, unsigned int deviceId) {
+  // Prepares one subtest; sets skip_ and returns early below OpenCL C 2.0.
+  error_ = CL_SUCCESS;
+  testId_ = test;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+  imageBuffer_ = 0;
+  memptr = NULL;  // keeps close() from freeing an indeterminate pointer
+  skip_ = false;
+
+  // check device version
+  size_t param_size = 0;
+  char *strVersion = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  strVersion = new char[param_size];
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION,
+                                param_size, strVersion, 0);
+  // Decide before freeing, so no exit path leaks the string; index 9 is the
+  const bool preCL2 = (error_ != CL_SUCCESS) || (param_size <= 9) ||
+                      (strVersion[9] < '2');  // major digit of "OpenCL C x.y"
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    skip_ = true;
+    testDescString =
+        "Image read_write qualifier not supported in OpenCL C < 2.0. Test "
+        "Skipped.";
+    return;
+  }
+
+  bufSize_ = Sizes[test % NUM_SIZES];
+  bufnum_ = (test / NUM_SIZES) % NUM_FORMATS;
+  memSize = bufSize_ * bufSize_ * formatSize[bufnum_];
+  numIter = 100;
+  memptr = new char[memSize];
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays an empty string if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // create image
+  setData(memptr, memSize, 0x0);
+  imageBuffer_ = _wrapper->clCreateImage2D(
+      context_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, &formats[bufnum_],
+      bufSize_, bufSize_, 0, memptr, &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clCreateImage2D() failed");
+
+  // set kernel arguments
+  const unsigned int zero = 0;
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &zero);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+void OCLPerfImageReadWrite::run(void) {
+  // Times numIter kernel launches (each followed by clFinish); reports GB/s.
+  if (skip_) return;
+
+  size_t globalSize[2] = {bufSize_, bufSize_};
+  size_t localSize[2] = {8, 8};
+  CPerfCounter stopwatch;
+
+  // Untimed launch ahead of the measured loop
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL,
+                                            globalSize, localSize, 0, NULL,
+                                            NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int left = numIter; left > 0; --left) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL,
+                                              globalSize, localSize, 0, NULL,
+                                              NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+  stopwatch.Stop();
+
+  // speed in GB/s; the factor 2 presumably counts the read plus the write
+  const double elapsedSec = stopwatch.GetElapsedTime();
+  const double scaledBytes = (double)memSize * numIter * (double)(1e-09);
+  _perfInfo = (float)(scaledBytes * 2 / elapsedSec);
+
+  // Same decoding of the subtest id that open() uses for bufnum_
+  const unsigned int formatIndex = (testId_ / NUM_SIZES) % NUM_FORMATS;
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), " (%4dx%4d) fmt:%s(%1d) i: %4d (GB/s) ",
+           bufSize_, bufSize_, textFormats[formatIndex], formatSize[bufnum_],
+           numIter);
+  testDescString = desc;
+}
+
+unsigned int OCLPerfImageReadWrite::close(void) {
+  if (!skip_) {
+    // Image first: it was created with CL_MEM_USE_HOST_PTR on top of memptr.
+    if (imageBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(imageBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(imageBuffer_) failed");
+    }
+    delete[] memptr;  // allocated with new char[]
+    memptr = NULL;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h
new file mode 100644
index 0000000000..327786527c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageReadWrite
+#define _OCL_ImageReadWrite
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageReadWrite : public OCLTestImp {
+ public:
+  OCLPerfImageReadWrite();
+  virtual ~OCLPerfImageReadWrite();
+  // Times a kernel that reads and rewrites a read_write 2D image (see the .cpp).
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  virtual void setData(void* ptr, unsigned int size, unsigned int value);
+
+  cl_command_queue cmd_queue_;  // cmdQueues_[_deviceId], assigned in open()
+  cl_mem imageBuffer_;          // 2D image created over memptr
+
+  unsigned int bufSize_;  // image width and height, in pixels
+  unsigned int bufnum_;   // index into the format tables of the .cpp
+  unsigned int numIter;   // timed kernel launches per run()
+  char* memptr;           // host storage backing imageBuffer_
+  unsigned int memSize;   // size of memptr in bytes
+  unsigned int testId_;   // subtest index passed to open()
+
+  bool skip_;  // set by open() when the device reports OpenCL C < 2.0
+};
+
+#endif  // _OCL_ImageReadWrite
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp
new file mode 100644
index 0000000000..5ad33bc14c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp
@@ -0,0 +1,236 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageReadsRGBA.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+#define NUM_SIZES 4
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+// Format tables indexed by bufnum_; formatSize holds bytes per pixel.
+#if defined(CL_VERSION_2_0)
+#define NUM_FORMATS 2
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}, {CL_sRGBA, CL_UNORM_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8",
+                                               "CL_sRGBA, CL_UNORM_INT8   "};
+// Four 8-bit channels per pixel; sizeof() of the enum constant is unrelated.
+static const unsigned int formatSize[NUM_FORMATS] = {4, 4};
+#else
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};  // 4 x 8-bit channels
+#endif
+
+const static char *strKernel = {KERNEL_CODE(  // OpenCL C source, stringified
+  \n __constant sampler_t s_nearest = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE;
+  \n
+        // Read sRGBA image object (input) and convert it to linear RGB values
+        // (results):
+            __kernel void image_kernel(read_only image2d_t input,
+                                       __global float4 *results) {
+              int x = get_global_id(0);
+              int y = get_global_id(1);
+
+              int offset = y * get_image_width(input) + x;
+
+              int2 coords = (int2)(x, y);
+              float4 tmp = read_imagef(input, s_nearest, coords);
+              if (x < 0 && tmp.x == 0.f) {  // NOTE(review): x < 0 looks unreachable; presumably only keeps the read alive
+                results[offset] = tmp;
+              }
+            }
+  \n)};
+
+OCLPerfImageReadsRGBA::OCLPerfImageReadsRGBA() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS;  // one subtest per size/format pair
+}
+
+OCLPerfImageReadsRGBA::~OCLPerfImageReadsRGBA() {}  // cleanup is in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty; not referenced elsewhere in this file
+
+void OCLPerfImageReadsRGBA::setData(void *ptr, unsigned int size, float value) {
+  // Stores (int)value, then value + 1, ... into the first size >> 2 words.
+  unsigned int *word = (unsigned int *)ptr;
+  for (unsigned int *end = word + (size >> 2); word != end; ++word) {
+    *word = (int)value++;
+  }
+}
+
+void OCLPerfImageReadsRGBA::open(unsigned int test, char *units,
+                                 double &conversion, unsigned int deviceId) {
+  // Prepares one subtest; sets skip_ and returns early on pre-2.0 devices.
+  error_ = CL_SUCCESS;
+  testId_ = test;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+  imageBuffer_ = 0;
+  valueBuffer_ = 0;
+  memptr = NULL;  // close() frees memptr even when this subtest is skipped
+  skip_ = false;
+
+  // check device version
+  size_t param_size = 0;
+  char *strVersion = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Decide before freeing, so no exit path leaks the string; index 7 is the
+  const bool preCL2 = (error_ != CL_SUCCESS) || (param_size <= 7) ||
+                      (strVersion[7] < '2');  // major digit of "OpenCL x.y"
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    skip_ = true;
+    testDescString =
+        "sRGBA Image not supported for < 2.0 devices. Test Skipped.";
+    return;
+  }
+
+  bufSize_ = Sizes[test % NUM_SIZES];
+  bufnum_ = (test / NUM_SIZES) % NUM_FORMATS;
+  memSize = bufSize_ * bufSize_ * formatSize[bufnum_];
+  numIter = 100;
+
+  memptr = new char[memSize];
+
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays an empty string if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  setData(memptr, memSize, 0.f);
+
+  // create image
+  imageBuffer_ = _wrapper->clCreateImage2D(
+      context_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &formats[bufnum_],
+      bufSize_, bufSize_, 0, memptr, &error_);
+  CHECK_RESULT(imageBuffer_ == 0, "clCreateImage2D(imageBuffer_) failed");
+
+  valueBuffer_ = _wrapper->clCreateBuffer(
+      context_, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, memSize, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error clCreateBuffer()");
+
+  // set kernel arguments
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &valueBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+void OCLPerfImageReadsRGBA::run(void) {
+  // Times numIter kernel launches (each followed by clFinish); reports GB/s.
+  if (skip_) return;
+
+  size_t globalSize[2] = {bufSize_, bufSize_};
+  size_t localSize[2] = {8, 8};
+  CPerfCounter stopwatch;
+
+  // warm-up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL,
+                                            globalSize, localSize, 0, NULL,
+                                            NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  stopwatch.Reset();
+  stopwatch.Start();
+
+  for (unsigned int left = numIter; left > 0; --left) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL,
+                                              globalSize, localSize, 0, NULL,
+                                              NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+  stopwatch.Stop();
+
+  // read_imagef from sRGB to linear RGB speed in GB/s
+  const double elapsedSec = stopwatch.GetElapsedTime();
+  const double scaledBytes = (double)memSize * numIter * (double)(1e-09);
+  _perfInfo = (float)(scaledBytes / elapsedSec);
+
+  // Same decoding of the subtest id that open() uses for bufnum_
+  const unsigned int formatIndex = (testId_ / NUM_SIZES) % NUM_FORMATS;
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), " (%4dx%4d) fmt:%s(%1d) i: %4d (GB/s) ",
+           bufSize_, bufSize_, textFormats[formatIndex], formatSize[bufnum_],
+           numIter);
+  testDescString = desc;
+}
+
+unsigned int OCLPerfImageReadsRGBA::close(void) {
+  // Image first: it was created with CL_MEM_USE_HOST_PTR on top of memptr.
+  if (imageBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(imageBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(imageBuffer_) failed");
+  }
+  if (valueBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(valueBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(valueBuffer_) failed");
+  }
+  if (!skip_) delete[] memptr;  // open() allocates memptr only when not skipped
+  memptr = NULL;
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h
new file mode 100644
index 0000000000..60f0ad6b79
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageReadsRGBA_H_
+#define _OCL_ImageReadsRGBA_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageReadsRGBA : public OCLTestImp {
+ public:
+  OCLPerfImageReadsRGBA();
+  virtual ~OCLPerfImageReadsRGBA();
+  // Test entry points (all virtual).
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  virtual void setData(void* ptr, unsigned int size, float value);
+  // imageBuffer_ and valueBuffer_ are released in close().
+  cl_command_queue cmd_queue_;
+  cl_mem imageBuffer_;
+  cl_mem valueBuffer_;
+  // Per-sub-test parameters and host-side state.
+  unsigned int bufSize_;  // image edge length as printed by run()
+  unsigned int bufnum_;   // index into formatSize[] in run()
+  unsigned int numIter;   // number of timed kernel launches in run()
+  char* memptr;           // host allocation freed in close()
+  unsigned int memSize;   // bytes counted per launch in the GB/s figure
+  unsigned int testId_;   // sub-test id; run() derives the format index from it
+
+  bool skip_;
+};
+
+#endif  // _OCL_ImageReadsRGBA_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp
new file mode 100644
index 0000000000..f2a9933c78
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp
@@ -0,0 +1,324 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageSampleRate.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_TYPES 6  // formats[], types[] and typeSizes[] are parallel tables
+static const cl_image_format formats[NUM_TYPES] = {
+    {CL_R, CL_UNSIGNED_INT8},    {CL_RG, CL_UNSIGNED_INT8},
+    {CL_RGBA, CL_UNSIGNED_INT8}, {CL_R, CL_FLOAT},
+    {CL_RGBA, CL_HALF_FLOAT},    {CL_RGBA, CL_FLOAT}};
+static const char *types[NUM_TYPES] = {
+    "R8", "R8G8", "R8G8B8A8", "R32F", "R16G16B16A16F", "R32G32B32A32F"};
+static const unsigned int typeSizes[NUM_TYPES] = {1, 2, 4, 4, 8, 16};
+// Image edge lengths; open() creates each image as width x width.
+#define NUM_SIZES 12
+static const unsigned int sizes[NUM_SIZES] = {1,  2,   4,   8,   16,   32,
+                                              64, 128, 256, 512, 1024, 2048};
+// Image counts are powers of two: 1 << 0 up to 1 << (NUM_BUFS - 1).
+#define NUM_BUFS 6
+#define MAX_BUFS (1 << (NUM_BUFS - 1))
+
+OCLPerfImageSampleRate::OCLPerfImageSampleRate() {
+  _numSubTests = NUM_TYPES * NUM_SIZES * NUM_BUFS;  // one per (format, size, image count)
+}
+// The destructor does nothing; OpenCL objects are released in close().
+OCLPerfImageSampleRate::~OCLPerfImageSampleRate() {}
+
+void OCLPerfImageSampleRate::setKernel(void) {  // rebuilds shader_ for numBufs_ images
+  shader_.clear();
+  shader_ +=
+      "kernel void sampleRate(global float4* outBuffer, unsigned int "
+      "inBufSize, unsigned int writeIt,\n";
+  char buf[256];
+  for (unsigned int i = 0; i < numBufs_; i++) {  // one image2d_t parameter per image
+    SNPRINTF(buf, sizeof(buf), "read_only image2d_t inBuffer%d", i);
+    shader_ += buf;
+    if (i < (numBufs_ - 1)) {  // comma after every parameter except the last
+      shader_ += ",";
+    }
+    shader_ += "\n";
+  }
+  shader_ += ")\n";
+  shader_ +=
+      "{\n"
+      "    uint gid = get_global_id(0);\n"
+      "    uint inputIdx = gid % inBufSize;\n"
+      "    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | "
+      "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
+      "    float4 tmp = (float4)0.0f;\n";
+  // One read_imagef per image; %% in the format yields a literal % in the kernel.
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    SNPRINTF(buf, sizeof(buf),
+             "    tmp += read_imagef(inBuffer%d, sampler, (int2)( gid %% "
+             "inBufSize, (gid / inBufSize) %% inBufSize));\n",
+             i);
+    shader_ += buf;
+  }
+  shader_ +=
+      "    if (writeIt*(unsigned int)tmp.x) outBuffer[gid] = tmp;\n"
+      "}\n";
+  // printf("Shader -> %s\n", shader_.c_str());
+}
+
+void OCLPerfImageSampleRate::setData(cl_mem buffer, unsigned int val) {  // fill image with val
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width_, width_, 1};
+  size_t image_row_pitch = 0, image_slice_pitch = 0;  // filled in by the map call
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapImage(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, origin, region, &image_row_pitch,
+      &image_slice_pitch, 0, NULL, NULL, &error_);
+  if (data == NULL || error_ != CL_SUCCESS) return;  // map failed; error_ holds the status
+  // NOTE(review): writes width_*width_ 32-bit words, ignoring pixel size and row pitch.
+  for (unsigned int i = 0; i < width_ * width_; i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL);
+}
+
+void OCLPerfImageSampleRate::checkData(cl_mem buffer) {  // no-op: the body is compiled out
+#if 0
+    float* data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0, outBufSize_, 0, NULL, NULL, &error_);
+    for (unsigned int i = 0; i < outBufSize_/sizeof(float); i++)
+    {
+        if (data[i] != (float)numBufs_) {
+            printf("Data validation failed at %d! Got %f, expected %f\n", i, data[i], (float)numBufs_);
+            break;
+        }
+    }
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL);
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // given to clCreateContext; ignores every report
+
+void OCLPerfImageSampleRate::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  // Clear every handle first; close() releases only the non-zero ones.
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+
+  // Decode the sub-test index into image size, image count and format.
+  width_ = sizes[test % NUM_SIZES];
+  numBufs_ = (1 << ((test / NUM_SIZES) % NUM_BUFS));
+  typeIdx_ = (test / (NUM_SIZES * NUM_BUFS)) % NUM_TYPES;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    delete[] platforms;  // array form: platforms was allocated with new[]
+  }
+  /*
+   * If we could find a platform, use it.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Copy the handle out and free the array before any early return can leak it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = (cl_mem *)malloc(sizeof(cl_mem) * numBufs_);
+  memset(inBuffer_, 0, sizeof(cl_mem) * numBufs_);
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    inBuffer_[i] = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY,
+                                             &formats[typeIdx_], width_, width_,
+                                             0, NULL, &error_);
+    CHECK_RESULT(inBuffer_[i] == 0, "clCreateImage2D(inBuffer) failed");
+  }
+
+  outBufSize_ = sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * sizeof(cl_float4);
+  outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                        outBufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  setKernel();
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  const char *buildOps = NULL;
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    cl_int intError;
+    char log[16384] = "";  // stays an empty string if the log query itself fails
+    intError =
+        _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                        16384 * sizeof(char), log, NULL);
+    printf("Build error -> %s\n", log);
+    // The condition must be true for CHECK_RESULT to report; 0 made this a no-op.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "sampleRate", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed");
+  unsigned int sizeDW = width_;
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int),
+                                    (void *)&sizeDW);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(sizeDW) failed");
+  unsigned int writeIt = 0;
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int),
+                                    (void *)&writeIt);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(writeIt) failed");
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, i + 3, sizeof(cl_mem),
+                                      (void *)&inBuffer_[i]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed");
+    // setData(inBuffer_[i], 0x3f800000);
+  }
+  // setData(outBuffer_, 0xdeadbeef);
+}
+
+void OCLPerfImageSampleRate::run(void) {
+  // One work-item per typeSizes[typeIdx_] bytes of the output buffer size.
+  const int workItems = outBufSize_ / typeSizes[typeIdx_];
+  const int groupSize = 64;
+  size_t gws[1] = {(size_t)workItems};
+  size_t lws[1] = {(size_t)groupSize};
+  // numBufs_ is a power of two up to MAX_BUFS, so launches * numBufs_ is fixed.
+  const unsigned int launches = MAX_ITERATIONS * (MAX_BUFS / numBufs_);
+
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  unsigned int issued = 0;
+  while (issued < launches) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              (const size_t *)gws,
+                                              (const size_t *)lws, 0, NULL, NULL);
+    ++issued;
+  }
+  // Only the status of the final enqueue is examined here.
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+  stopwatch.Stop();
+  const double elapsed = stopwatch.GetElapsedTime();
+
+  // Bandwidth in GB/s, charging outBufSize_ bytes per image per launch.
+  const double gbPerSec =
+      ((double)outBufSize_ * numBufs_ * (double)launches * (double)(1e-09)) /
+      elapsed;
+  _perfInfo = (float)gbPerSec;
+
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), "Domain %dx%d,  %13s, %2d images,%4dx%4d (GB/s)",
+           sizes[NUM_SIZES - 1], sizes[NUM_SIZES - 1], types[typeIdx_],
+           numBufs_, width_, width_);
+  testDescString = desc;
+}
+
+unsigned int OCLPerfImageSampleRate::close(void) {
+  // Wait for queued work before releasing the objects it may still reference.
+  _wrapper->clFinish(cmd_queue_);
+  if (inBuffer_ != NULL) {
+    for (unsigned int slot = 0; slot < numBufs_; ++slot) {
+      if (inBuffer_[slot] == 0) {
+        continue;  // slot still holds the zero written by open()'s memset
+      }
+      error_ = _wrapper->clReleaseMemObject(inBuffer_[slot]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(inBuffer_) failed");
+    }
+    free(inBuffer_);
+  }
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h
new file mode 100644
index 0000000000..3705538e51
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_IMAGESAMPLERATE_H_
+#define _OCL_IMAGESAMPLERATE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageSampleRate : public OCLTestImp {
+ public:
+  OCLPerfImageSampleRate();
+  virtual ~OCLPerfImageSampleRate();
+  // Test entry points (all virtual).
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Kernel source text built by setKernel(), plus the data helpers.
+  std::string shader_;
+  void setData(cl_mem buffer, unsigned int data);
+  void checkData(cl_mem buffer);
+  void setKernel(void);
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem* inBuffer_;  // malloc'ed array of numBufs_ image handles
+  cl_mem outBuffer_;
+  cl_int error_;  // status of the most recent OpenCL call
+  // Parameters decoded from the sub-test index in open().
+  unsigned int width_;        // edge length of each square input image
+  unsigned int outBufWidth_;  // NOTE(review): not referenced in the .cpp — confirm
+  unsigned int outBufSize_;   // size in bytes of outBuffer_
+  static const unsigned int MAX_ITERATIONS = 25;
+  unsigned int numBufs_;  // number of input images bound to the kernel
+  unsigned int typeIdx_;  // index into formats[], types[] and typeSizes[]
+};
+
+#endif  // _OCL_IMAGESAMPLERATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp
new file mode 100644
index 0000000000..3886d3cfe9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp
@@ -0,0 +1,317 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfImageWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4  // image edge lengths under test
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+// Image formats with their printable names and bytes per pixel (parallel tables).
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};
+// Timed-iteration counts: a single write, then NUM_ITER writes.
+static const unsigned int Iterations[2] = {1, OCLPerfImageWriteSpeed::NUM_ITER};
+
+OCLPerfImageWriteSpeed::OCLPerfImageWriteSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS * 2;  // x2 for the two Iterations[] entries
+}
+// The destructor does nothing; resources are released in close().
+OCLPerfImageWriteSpeed::~OCLPerfImageWriteSpeed() {}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // given to clCreateContext; ignores every report
+
+void OCLPerfImageWriteSpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  // Clear every handle first; close() releases only the non-zero ones.
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  memptr = NULL;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+#if 0
+        // Get last for default
+        platform = platforms[numPlatforms-1];
+        for (unsigned i = 0; i < numPlatforms; ++i) {
+#endif
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice,
+                                      0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose platform with GPU devices
+    // if (num_devices > 0)
+    //{
+    //    platform = platforms[_platformIndex];
+    //    break;
+    //}
+#if 0
+        }
+#endif
+    delete[] platforms;  // array form: platforms was allocated with new[]
+  }
+  // Decode the sub-test index into image size, format and iteration count.
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Copy the handle out and free the array before any early return can leak it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+  memptr = new char[bufSize_ * bufSize_ * formatSize[bufnum_]];
+}
+
+void OCLPerfImageWriteSpeed::run(void) {
+  CPerfCounter timer;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+  // Warm up
+  error_ =
+      _wrapper->clEnqueueWriteImage(cmd_queue_, outBuffer_, CL_TRUE, origin,
+                                    region, 0, 0, memptr, 0, NULL, NULL);
+  // The failure text names the call actually made (it used to say ReadImage).
+  CHECK_RESULT(error_, "clEnqueueWriteImage failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    error_ =
+        _wrapper->clEnqueueWriteImage(cmd_queue_, outBuffer_, CL_TRUE, origin,
+                                      region, 0, 0, memptr, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueWriteImage failed");
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Image write bandwidth in GB/s
+  double perf = ((double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter *
+                 (double)(1e-09)) /
+                sec;
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s i: %4d (GB/s) ", bufSize_,
+           bufSize_, textFormats[bufnum_], numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfImageWriteSpeed::close(void) {  // frees what open() created
+  if (memptr) {
+    delete[] memptr;  // array form: open() allocates memptr with new char[]
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+OCLPerfPinnedImageWriteSpeed::OCLPerfPinnedImageWriteSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS * 2;  // x2 for the two Iterations[] entries
+}
+// The destructor does nothing; resources are released in close().
+OCLPerfPinnedImageWriteSpeed::~OCLPerfPinnedImageWriteSpeed() {}
+
+void OCLPerfPinnedImageWriteSpeed::open(unsigned int test, char *units,
+                                        double &conversion,
+                                        unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  // Clear every handle first; close() releases only the non-zero ones.
+  context_ = 0;
+  cmd_queue_ = 0;
+  inBuffer_ = 0;  // close() tests this, so it must not be left uninitialised
+  outBuffer_ = 0;
+  memptr = NULL;
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice,
+                                      0, NULL, &num_devices);
+    delete[] platforms;  // array form: platforms was allocated with new[]
+  }
+  // Decode the sub-test index into image size, format and iteration count.
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)];
+
+  CHECK_RESULT(platform == 0, "Couldn't find platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Copy the handle out and free the array before any early return can leak it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+  inBuffer_ = _wrapper->clCreateBuffer(
+      context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL,
+      &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  flags = CL_MEM_WRITE_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+  // memptr is the mapped view of inBuffer_; close() unmaps it instead of deleting it.
+  memptr = (char *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+      bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+}
+
+unsigned int OCLPerfPinnedImageWriteSpeed::close(void) {  // unmaps memptr, then releases objects
+  if (memptr) {
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, memptr, 0,
+                                               NULL, NULL);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clEnqueueUnmapMemObject(inBuffer_) failed");
+    _wrapper->clFinish(cmd_queue_);  // via _wrapper, like every other CL call here
+  }
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h
new file mode 100644
index 0000000000..20fec5124a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageWriteSpeed_H_
+#define _OCL_ImageWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfImageWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfImageWriteSpeed();
+  virtual ~OCLPerfImageWriteSpeed();
+  // Test entry points (all virtual).
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Second entry of the Iterations[] table in the .cpp.
+  static const unsigned int NUM_ITER = 100;
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // destination image of clEnqueueWriteImage in run()
+  cl_int error_;      // status of the most recent OpenCL call
+  // Parameters decoded from the sub-test index in open().
+  unsigned int bufSize_;  // image edge length
+  unsigned int bufnum_;   // index into formats[], textFormats[] and formatSize[]
+  unsigned int numIter;   // number of timed writes in run()
+  char* memptr;           // host source data passed to clEnqueueWriteImage
+};
+
+class OCLPerfPinnedImageWriteSpeed : public OCLPerfImageWriteSpeed {
+ public:
+  OCLPerfPinnedImageWriteSpeed();
+  virtual ~OCLPerfPinnedImageWriteSpeed();
+  // open()/close() are replaced; run() is inherited from OCLPerfImageWriteSpeed.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual unsigned int close(void);
+  // CL_MEM_ALLOC_HOST_PTR buffer that open() maps to obtain memptr.
+  cl_mem inBuffer_;
+};
+
+#endif  // _OCL_ImageWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp
new file mode 100644
index 0000000000..96310d5b52
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp
@@ -0,0 +1,239 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfKernelArguments.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Number of cl_int4 elements per buffer; open() allocates
+// BufSize * sizeof(cl_int4) bytes for every buffer it creates.
+static const size_t BufSize = 0x1000;
+// Dispatch budget: run() divides this by the queue count and the buffer count
+// to obtain the number of timed outer iterations.
+static const size_t Iterations = 0x10000;
+// Number of queue-count variants (run() maps them to 1, 2, 4 or 8 queues).
+static const size_t TotalQueues = 4;
+// Number of buffer-count variants (see the NumBuffs table in open()).
+static const size_t NumBufCnts = 4;
+// Number of kernel signatures stored in Arguments[].
+static const size_t TotalArgs = 4;
+
+// Bounded string formatting: sprintf_s when WIN_OS is defined, snprintf
+// otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Kernel parameter lists substituted into strKernel. The four entries declare
+// 1, 5, 10 and 20 __global uint* parameters respectively; only "out" is
+// written by the kernel, the remaining buffers exist to grow the argument
+// list.
+static const char* Arguments[TotalArgs] = {
+    "__global uint* out",
+    "__global uint* out, __global uint* buf0, __global uint* buf1, __global "
+    "uint* buf2, __global uint* buf3",
+    "__global uint* out, __global uint* buf0, __global uint* buf1, __global "
+    "uint* buf2, __global uint* buf3, \n"
+    "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global "
+    "uint* buf7, __global uint* buf8",
+    "__global uint* out, __global uint* buf0, __global uint* buf1, __global "
+    "uint* buf2, __global uint* buf3,\n"
+    "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global "
+    "uint* buf7, __global uint* buf8,\n"
+    "__global uint* buf9, __global uint* buf10, __global uint* buf11, __global "
+    "uint* buf12, __global uint* buf13,\n"
+    "__global uint* buf14, __global uint* buf15, __global uint* buf16, "
+    "__global uint* buf17, __global uint* buf18"};
+
+// printf-style template of the kernel source; the single %s is replaced by
+// one of the Arguments[] parameter lists. Each work-item stores 1 into
+// out[get_global_id(0)].
+static const char* strKernel =
+    "__kernel void dummy(%s)                    \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   uint value = 1;                         \n"
+    "   out[id] = value;                        \n"
+    "}                                          \n";
+
+// Registers the subtest count: every (queue count, kernel signature, buffer
+// count) combination exists twice, once for each timing mode selected by
+// perBatch_ in open().
+OCLPerfKernelArguments::OCLPerfKernelArguments() {
+  failed_ = false;
+  const size_t combinations = TotalQueues * TotalArgs * NumBufCnts;
+  _numSubTests = 2 * combinations;
+}
+
+// Empty destructor: this class releases nothing itself here.
+OCLPerfKernelArguments::~OCLPerfKernelArguments() {}
+
+// Prepares one subtest.
+//
+// The subtest index is decoded as follows:
+//   test % TotalQueues                               -> queue count (run())
+//   (test / TotalQueues) % TotalArgs                 -> kernel signature
+//   (test / (TotalQueues * TotalArgs)) % NumBufCnts  -> number of buffers
+//   test >= TotalQueues * TotalArgs * NumBufCnts     -> per-batch timing
+//
+// Builds the "dummy" kernel with the selected signature and appends the
+// selected number of BufSize * sizeof(cl_int4) byte buffers to buffers_.
+// On a non-GPU device the test is marked as failed_ and run() becomes a
+// no-op.
+//
+// The kernel source is formatted into a stack buffer so that nothing leaks
+// when a CHECK_RESULT bails out early (assumes CHECK_RESULT returns from the
+// function on failure, as its CHECK_RESULT_NO_RETURN sibling suggests).
+void OCLPerfKernelArguments::open(unsigned int test, char* units,
+                                  double& conversion, unsigned int deviceId) {
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  perBatch_ = test >= (TotalQueues * TotalArgs * NumBufCnts);
+
+  // Instantiate the kernel template with the selected parameter list. The
+  // longest expansion is well below the 4096 byte buffer.
+  size_t numArguments = (test_ / TotalQueues) % TotalArgs;
+  char program[4096];
+  SNPRINTF(program, sizeof(program), strKernel, Arguments[numArguments]);
+  const char* source = program;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Start with an empty string so that a failing log query cannot make
+    // printf read uninitialized memory.
+    char programLog[1024] = "";
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  static const size_t NumBuffs[NumBufCnts] = {0x20, 0x100, 0x800, 0x2000};
+
+  size_t numMems = NumBuffs[(test_ / (TotalQueues * TotalArgs)) % NumBufCnts];
+  size_t bufSize = BufSize * sizeof(cl_int4);
+  for (size_t b = 0; b < numMems; ++b) {
+    cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                             bufSize, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+}
+
+// No-op callback with the signature of an OpenCL context error-notification
+// function. It is not referenced anywhere else in this file.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Times kernel dispatches whose arguments are re-bound before every launch.
+//
+// The subtest selects 1, 2, 4 or 8 in-order command queues. For each timed
+// dispatch all kernel arguments are set to a sliding window of buffers_ and a
+// single 256 work-item group is enqueued. With perBatch_ set, every queue is
+// flushed after its enqueue and all queues are finished after each round over
+// the queues; otherwise the queues are only finished once at the end.
+//
+// Result: _perfInfo is the elapsed time in microseconds divided by the number
+// of dispatches, and testDescString describes the configuration.
+//
+// Does nothing when open() marked the test as failed_.
+//
+// NOTE(review): the command queues are still leaked if a CHECK_RESULT inside
+// the dispatch loops bails out early (assuming CHECK_RESULT returns).
+void OCLPerfKernelArguments::run(void) {
+  if (failed_) {
+    return;
+  }
+  CPerfCounter timer;
+  static const size_t Queues[] = {1, 2, 4, 8};
+  size_t numQueues = Queues[test_ % TotalQueues];
+
+  // Store the query result so the check below tests this call rather than a
+  // stale error_ left behind by open().
+  cl_uint numArguments = 0;
+  error_ = _wrapper->clGetKernelInfo(kernel_, CL_KERNEL_NUM_ARGS,
+                                     sizeof(cl_uint), &numArguments, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelInfo() failed");
+  // Both values are used as divisors below.
+  CHECK_RESULT(((numArguments == 0) || buffers_.empty()),
+               "No kernel arguments or buffers to dispatch");
+
+  size_t iter = Iterations / numQueues / buffers_.size();
+  iter = (iter == 0) ? 1 : iter;
+
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  for (size_t q = 0; q < numQueues; ++q) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[_deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues[q] = cmdQueue;
+  }
+  // Warm-up: bind every buffer at least once before the timed section.
+  for (size_t b = 0; b < (buffers_.size() / numArguments); ++b) {
+    for (size_t q = 0; q < numQueues; ++q) {
+      for (cl_uint a = 0; a < numArguments; ++a) {
+        cl_mem buffer = buffers()[(b * numArguments + a) % buffers_.size()];
+        error_ = _wrapper->clSetKernelArg(kernel_, a, sizeof(cl_mem), &buffer);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+      }
+
+      size_t gws[1] = {256};
+      size_t lws[1] = {256};
+      error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                                gws, lws, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    }
+  }
+  for (size_t q = 0; q < numQueues; ++q) {
+    _wrapper->clFinish(cmdQueues[q]);
+  }
+
+  // Number of timed dispatches; at least one because iter, buffers_.size()
+  // and numQueues are all non-zero here.
+  size_t disp = 0;
+  timer.Reset();
+  timer.Start();
+
+  for (size_t i = 0; i < iter; ++i) {
+    for (size_t b = 0; b < buffers_.size(); ++b) {
+      for (size_t q = 0; q < numQueues; ++q) {
+        for (cl_uint a = 0; a < numArguments; ++a) {
+          cl_mem buffer = buffers()[(b * numArguments + a) % buffers_.size()];
+          error_ =
+              _wrapper->clSetKernelArg(kernel_, a, sizeof(cl_mem), &buffer);
+          CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+        }
+
+        size_t gws[1] = {256};
+        size_t lws[1] = {256};
+        error_ = _wrapper->clEnqueueNDRangeKernel(
+            cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, NULL);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+        disp++;
+        if (perBatch_) {
+          _wrapper->clFlush(cmdQueues[q]);
+        }
+      }
+      if (perBatch_) {
+        for (size_t q = 0; q < numQueues; ++q) {
+          _wrapper->clFinish(cmdQueues[q]);
+        }
+      }
+    }
+  }
+  for (size_t q = 0; q < numQueues; ++q) {
+    _wrapper->clFinish(cmdQueues[q]);
+  }
+  timer.Stop();
+
+  for (size_t q = 0; q < numQueues; ++q) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  std::stringstream stream;
+  if (perBatch_)
+    stream << "Time per batch    (us) for " << numQueues << " queues, ";
+  else
+    stream << "Time per dispatch (us) for " << numQueues << " queues, ";
+  stream.flags(std::ios::right | std::ios::showbase);
+  stream.width(2);
+  stream << numArguments;
+  stream << " args, ";
+  stream.flags(std::ios::right | std::ios::showbase);
+  stream.width(4);
+  stream << buffers_.size() << " bufs";
+  testDescString = stream.str();
+  _perfInfo = static_cast<float>(timer.GetElapsedTime() * 1000000 / disp);
+}
+
+// Tear-down is delegated entirely to the base class; its status is returned
+// unchanged.
+unsigned int OCLPerfKernelArguments::close(void) {
+  const unsigned int status = OCLTestImp::close();
+  return status;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h
new file mode 100644
index 0000000000..997ac22e59
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_KERNEL_ARGUMENTS_H_
+#define _OCL_PERF_KERNEL_ARGUMENTS_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures the cost of dispatching a trivial kernel
+// while re-binding its buffer arguments before every launch. Subtests vary
+// the number of command queues, kernel arguments and buffers, and whether
+// queues are flushed/finished per batch.
+class OCLPerfKernelArguments : public OCLTestImp {
+ public:
+  OCLPerfKernelArguments();
+  virtual ~OCLPerfKernelArguments();
+
+ public:
+  // open() builds the kernel and allocates the buffers for subtest `test`,
+  // run() performs the timed dispatches, close() forwards to OCLTestImp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Set by open() when the device is not a GPU; run() then returns at once.
+  bool failed_;
+  // Subtest index passed to open().
+  unsigned int test_;
+  // True for the second half of the subtests: run() then flushes after each
+  // enqueue and finishes all queues after each round over the queues.
+  bool perBatch_;
+};
+
+#endif  // _OCL_PERF_KERNEL_ARGUMENTS_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp
new file mode 100644
index 0000000000..860d016bb7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp
@@ -0,0 +1,1008 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfKernelThroughput.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sstream>
+#include <vector>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Defined to 1; not referenced in the part of this file that defines it
+// (NOTE(review): check later code before removing).
+#define DO_GPU_KERNELS 1
+
+// Debug tracing helpers. Change the "#if 0" to "#if 1" to make ENTER/EXIT
+// print function entry/exit and to enable statements wrapped in PKT(); in the
+// default configuration all three expand to nothing.
+#if 0
+#define ENTER(X) printf("Entering %s\n", X);
+#define EXIT(X) printf("Exiting  %s\n", X);
+#define PKT(X) X
+#else
+#define ENTER(X)
+#define EXIT(X)
+#define PKT(X)
+#endif
+
+// Rounds a non-negative integer VAL down to a multiple of MULT (integer
+// division truncates), e.g. to work with multiples of 128. Both arguments are
+// parenthesized so that expression arguments such as ROUND_MULT(a + b, c)
+// expand correctly. MULT must be non-zero; each argument may be evaluated
+// more than once.
+#define ROUND_MULT(VAL, MULT) (((VAL) / (MULT)) * (MULT))
+/*
+int roundUp( int numToRound, int multiple)
+{
+    int r = numToRound % multiple;
+    if (r == 0)
+    {
+        return numToRound;
+    } else {
+        return numToRound + multiple - remainder;
+    }
+}
+*/
+// Quiet compiler warnings: use sprintf_s when WIN_OS is defined and snprintf
+// otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Work-items per work-group for the Madds kernel launch.
+#define WORK_GROUP_SIZE 256
+
+/*******************************************************************************
+ * Enumerated Types for Tests
+ ******************************************************************************/
+
+// Kernel variants: index 0 selects the matrix-multiply shader, index 1 the
+// multiply-add ("Madds") shader.
+const LARGE_INT numKernelTypes = 2;
+static const char *kernelType[numKernelTypes] = {"MatMul", "Madds"};
+
+// Memory location of the buffers: host-resident or device-resident.
+const LARGE_INT numMemPaths = 2;
+static const char *memPath[numMemPaths] = {"Host", "Device"};
+
+// Problem sizes in elements (powers of four from 4 to 16777216). The two
+// largest sizes are commented out, which is why the count is 12 rather
+// than 15.
+const LARGE_INT numNumElements = 12;  // 15;
+static const LARGE_INT numElements[numNumElements] = {
+    4,       16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304,
+    16777216  //,
+    // 67108864,
+    // 268435456
+};
+
+// Work multiplier per subtest: open() uses it as the flops-per-byte value for
+// the Madds kernel and as the factor between the two matrix dimensions for
+// the matrix-multiply kernel.
+const LARGE_INT numWorkSizes = 5;
+static const LARGE_INT workSize[numWorkSizes] = {1, 4, 16, 64, 256};
+
+// initFloat is the value the expected results (gold_) are derived from;
+// zeroFloat is a named 0.0f.
+const float initFloat = 0.001f;
+const float zeroFloat = 0.0f;
+
+// Identical to the WORK_GROUP_SIZE definition earlier in this file (a
+// benign redefinition).
+#define WORK_GROUP_SIZE 256
+
+/*******************************************************************************
+ * Write the Matrix Multiply Shader Kernel
+ ******************************************************************************/
+// Generates the OpenCL source of a tiled matrix multiply, C = A * B, into
+// shader_ and the expected value of each result element into gold_.
+//
+// matrixDim1_ and matrixDim2_ are baked into the source as HA and WA, so A is
+// HA x WA, B is WA x HA and C is HA x HA. The kernel ("kernel1") requires
+// 16x16 work-groups and stages 16x16 tiles of A and B in local memory.
+void OCLPerfKernelThroughput::genShaderMatrixMultiply() {
+  ENTER("genShaderMatrixMultiply");
+
+  std::stringstream ss;
+  // Note: clear() only resets the stream's error state; the stream is already
+  // empty at this point.
+  ss.clear();
+#if 0
+    printf("%ix%i * %ix%i = %ix%i:\n",
+            matrixDim1_, matrixDim2_,
+            matrixDim2_, matrixDim1_,
+            matrixDim1_, matrixDim1_
+            );
+#endif
+  ss << "#define BLOCK_SIZE 16\n"
+        "#define HA "
+     << matrixDim1_
+     << "\n"
+        "#define WA "
+     << matrixDim2_
+     << "\n"
+        "#define HB WA\n"
+        "#define WB HA\n"
+        "#define HC HA\n"
+        "#define WC WB\n"
+        "__kernel void\n"
+        "__attribute__((reqd_work_group_size(16,16,1)))\n"
+        "kernel1(\n"
+        "       __global float * restrict C,\n"
+        "       __global float * restrict A,\n"
+        "       __global float * restrict B )\n"
+        "{\n"
+        "   int bx = get_group_id(0);\n"
+        "   int by = get_group_id(1);\n"
+        "   int tx = get_local_id(0);\n"
+        "   int ty = get_local_id(1);\n"
+        "   int aBegin = WA * BLOCK_SIZE * by;\n"
+        "   int aEnd   = aBegin + WA - 1;\n"
+        "   int aStep  = BLOCK_SIZE;\n"
+        "   int bBegin = BLOCK_SIZE * bx;\n"
+        "   int bStep  = BLOCK_SIZE * WB;\n"
+        "   __private float c = 0.f;\n"
+        "   __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n"
+        "   __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n"
+        "   for (\n"
+        "           int a = aBegin, b = bBegin;\n"
+        "           a <= aEnd;\n"
+        "           a += aStep, b += bStep)\n"
+        "   {\n"
+        "       localA[ty][tx] = (get_global_id(0) < WA && get_global_id(1) < "
+        "HA) ? A[a + WA * ty + tx] : 0;\n"
+        "       localB[ty][tx] = (get_global_id(0) < WB && get_global_id(1) < "
+        "HB) ? B[b + WB * ty + tx] : 0;\n"
+        "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "       for (int k = 0; k < BLOCK_SIZE; ++k)\n"
+        "           c += localA[ty][k] * localB[k][tx];\n"
+        "       barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "   }\n"
+        "   int cIdx = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx + WB * ty + tx;\n"
+        "   if (get_global_id(0) < WC && get_global_id(1) < WC)\n"
+        "   {\n"
+        "       C[cIdx] = c;\n"
+        "   }\n"
+        "}\n";
+
+  shader_ = ss.str();
+  // Expected element value: matrixDim2_ products of initFloat * initFloat,
+  // accumulated one at a time in float (presumably to mirror the rounding of
+  // the kernel's running sum; assumes both inputs are filled with initFloat -
+  // TODO confirm against the buffer initialization).
+  gold_ = 0.f;
+  for (int i = 0; i < matrixDim2_; i++) gold_ += initFloat * initFloat;
+  // gold_ = initFloat * initFloat * matrixDim2_;
+  // printf("shader:\n%s\n", shader_.c_str());
+  // printf("gold_: %f\n", gold_);
+  EXIT("genShaderMatrixMultiply");
+}
+
+/*******************************************************************************
+ * Write the Madds Shader Kernel
+ ******************************************************************************/
+// Generates the OpenCL source of the multiply-add kernel into shader_ and the
+// expected per-component output value into gold_.
+//
+// The kernel ("kernel1", 256x1x1 work-groups) loops over the input in steps
+// of get_global_size(0), loads one float4 per iteration, runs an unrolled
+// chain of dependent multiply-adds on it and stores the result. The amount of
+// arithmetic per element is controlled by flopsPerByte_, and the element
+// count numElements[numElementsIdx_] is baked into the loop bound.
+void OCLPerfKernelThroughput::genShaderMadds() {
+  ENTER("genShaderMadds");
+
+  // Number of unrolled 8-statement groups; the integer arithmetic reduces to
+  // 2 * flopsPerByte_.
+  int flopLoopIter = 2 * (flopsPerByte_ * 4 * 4) / 16;  // bytes, flops
+
+  std::stringstream ss;
+  // Note: clear() only resets the stream's error state.
+  ss.clear();
+  // Host-side replica of the kernel's a*/b* recurrence, used to derive gold_.
+  float a, b;
+
+  ss <<  // begin kernel
+      "__kernel void\n"
+      "__attribute__((reqd_work_group_size("
+     << 256
+     << ",1,1)))\n"
+        "kernel1(\n"
+        "   __global float4 * restrict input,\n"
+        "   __global float4 * restrict output )\n"
+        "{\n";
+
+  // begin loop
+  ss << "   for ( uint idx = get_global_id(0);\n"
+        "         idx < "
+     << numElements[numElementsIdx_]
+     << ";\n"
+        "         idx += get_global_size(0) )\n"
+        "   {\n";
+
+  // do load
+  ss << "       float4 prefetch = input[ idx ];\n"
+        "       float a0 = prefetch.x;\n"
+        "       float a1 = prefetch.y;\n"
+        "       float a2 = prefetch.z;\n"
+        "       float a3 = prefetch.w;\n"
+        "       float b0 = a0;\n"
+        "       float b1 = a1;\n"
+        "       float b2 = a2;\n"
+        "       float b3 = a3;\n";
+  // Assumes every input component equals initFloat, so all four lanes follow
+  // the same scalar recurrence - TODO confirm against the buffer fill code.
+  a = initFloat;
+  b = a;
+
+  // do math
+  for (int i = 0; i < flopLoopIter; i++) {
+    ss << "       a0 += b3*b1;\n"
+          "       a1 += b0*b2;\n"
+          "       a2 += b1*b3;\n"
+          "       a3 += b2*b0;\n"
+          "       b0 += a3*a1;\n"
+          "       b1 += a0*a2;\n"
+          "       b2 += a1*a3;\n"
+          "       b3 += a2*a0;\n";
+    // Advance the host replica by the same step the emitted code performs.
+    // printf("a += b*b; %f += %f*%f\n", a, b, b);
+    a += b * b;
+    // printf("b += a*a; %f += %f*%f\n", b, a, a);
+    b += a * a;
+  }
+
+  // do write or accumulate
+  ss << "       __private float4 tmp;\n"
+        "       tmp.x = b0;\n"
+        "       tmp.y = b1;\n"
+        "       tmp.z = b2;\n"
+        "       tmp.w = b3;\n"
+        "       output[ idx ] = tmp;\n";
+  // The kernel writes the b* values, so the expected output is the final b.
+  gold_ = b;
+  // printf("GPU gold_ Tmp: %f\n", gold_);
+
+  // end loop
+  ss << "   } // end loop\n";
+  // end kernel
+  ss << " } // end kernel\n\n";
+
+  shader_ = ss.str();
+  // printf("shader:\n%s\n", shader_.c_str());
+  // printf("gold_: %f\n", gold_);
+  EXIT("genShaderMadds");
+}
+
+// No-op error-notification callback handed to clCreateContext by this test;
+// all arguments are ignored.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+/*******************************************************************************
+ * Constructor
+ ******************************************************************************/
+// Registers the number of subtests and probes the target device.
+//
+// Picks the first platform that reports devices of type type_ (falling back
+// to the last platform), selects device _deviceId on it and records its
+// maximum clock frequency and compute-unit count. A context is created and
+// released again as part of the probe.
+//
+// The platform and device lists live in std::vector so they are released on
+// every exit path, including early exits through CHECK_RESULT (assumed to
+// return from the constructor on failure). This replaces a new[] array that
+// was released with scalar delete and a malloc'd array that was never freed.
+OCLPerfKernelThroughput::OCLPerfKernelThroughput() {
+  ENTER("constructor");
+  _numSubTests = numKernelTypes * numMemPaths * numNumElements * numWorkSizes;
+
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id device = NULL;
+  context_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    std::vector<cl_platform_id> platforms(numPlatforms);
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // Get last for default
+    platform = platforms[numPlatforms - 1];
+    for (unsigned i = 0; i < numPlatforms; ++i) {
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present
+      // instead of just returning 0 devices
+      // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+      // Choose platform with GPU devices
+      if (num_devices > 0) {
+        // printf("NumDevices: %i\n", num_devices);
+        platform = platforms[i];
+        break;
+      }
+    }
+  }
+
+  /*
+   * If we could find our platform, use it, else die.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  // An empty device list cannot be queried or indexed.
+  CHECK_RESULT(num_devices == 0, "no devices");
+  std::vector<cl_device_id> devices(num_devices);
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, num_devices, &devices[0],
+                                    NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  device = devices[_deviceId];
+
+  // get gpu speed
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                     sizeof(maxClockFrequency_),
+                                     &maxClockFrequency_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(maxComputeUnits_),
+                                     &maxComputeUnits_, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  // NOTE(review): this clamp is overwritten by the second compute-unit query
+  // at the end of the constructor; confirm which value is intended.
+  if (maxComputeUnits_ > 8) {
+    // printf("%i CUs reported; assuming 8 instead.", maxComputeUnits_);
+    maxComputeUnits_ = 8;
+  }
+  // printf("Compute Units: %i\n", maxComputeUnits_);
+
+  // printf("Subtests: %i\n", _numSubTests);
+
+  // create context
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // Re-query the compute-unit count through a cl_uint, the type this query
+  // returns, and widen it explicitly. tmp starts at 0 so that a failed query
+  // yields a defined value instead of an uninitialized read.
+  cl_uint tmp = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(tmp), &tmp, NULL);
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  // printf("NumComputeUnits: %u\n", tmp);
+  maxComputeUnits_ = static_cast<LARGE_INT>(tmp);
+  // printf("NumComputeUnits: %lld\n", maxComputeUnits_);
+  EXIT("constructor");
+}
+
+// Empty destructor: nothing is released here.
+OCLPerfKernelThroughput::~OCLPerfKernelThroughput() {}
+
+/*******************************************************************************
+ * Open - initializes test, compile GPU kernel
+ ******************************************************************************/
+void OCLPerfKernelThroughput::open(unsigned int test, char *units,
+                                   double &conversion, unsigned int deviceId) {
+  ENTER("open");
+  /***********************************************************
+   * select subtest
+   **********************************************************/
+  int testIdx =
+      test + numKernelTypes * numMemPaths * numNumElements * numWorkSizes;
+  memPathIdx_ = testIdx % numMemPaths;
+  testIdx /= numMemPaths;
+  numElementsIdx_ = testIdx % numNumElements;
+  testIdx /= numNumElements;
+  workSizeIdx_ = testIdx % numWorkSizes;
+  testIdx /= numWorkSizes;
+  kernelTypeIdx_ = testIdx % numKernelTypes;
+  testIdx /= numKernelTypes;
+
+  // float md1;
+
+  // kernel values
+  switch (kernelTypeIdx_) {
+    case 0:  // Matrix Multiply
+      // md1 = sqrt(1.f*numElements[numElementsIdx_]);
+      // printf("MD1: sqrt(%f) = %f\n", 1.f*numElements[numElementsIdx_],md1);
+      matrixDim1_ = static_cast<int>(sqrt(1.f * numElements[numElementsIdx_]));
+      matrixDim2_ = matrixDim1_ * (int)workSize[workSizeIdx_];
+      genShaderMatrixMultiply();
+      work_dim_ = 2;
+      global_work_size_ = new size_t[work_dim_];
+      global_work_size_[0] = ((matrixDim1_ - 1) / 16 + 1) *
+                             16;  // matrixDim1_ < 16 ? 16 : matrixDim1_;
+      global_work_size_[1] = global_work_size_[0];
+      local_work_size_ = new size_t[work_dim_];
+      local_work_size_[0] = 16;
+      local_work_size_[1] = local_work_size_[0];
+      /*
+      printf("Global: %ix%i; Local: %ix%i; Matrix: %ix%i\n",
+              global_work_size_[0],
+              global_work_size_[1],
+              local_work_size_[0],
+              local_work_size_[1],
+              matrixDim1_,
+              matrixDim2_
+              );
+      */
+      input1BufferSize_ =
+          static_cast<size_t>(matrixDim1_ * matrixDim2_ * sizeof(float));
+      input2BufferSize_ =
+          static_cast<size_t>(matrixDim2_ * matrixDim1_ * sizeof(float));
+      output1BufferSize_ =
+          static_cast<size_t>(matrixDim1_ * matrixDim1_ * sizeof(float));
+      _reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) +
+                     (1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) +
+                     (1.0 * matrixDim1_ * matrixDim1_ * sizeof(float));
+      break;
+    case 1:                                         // Flops/Byte
+      flopsPerByte_ = (int)workSize[workSizeIdx_];  // for kernelType == 0
+      genShaderMadds();
+      numWorkGroupsPerComputeUnit_ = 32;  // TODO
+      numThreads_ =
+          numWorkGroupsPerComputeUnit_ * maxComputeUnits_ * WORK_GROUP_SIZE;
+      work_dim_ = 1;
+      global_work_size_ = new size_t[work_dim_];
+      local_work_size_ = new size_t[work_dim_];
+      global_work_size_[0] = numThreads_;
+      local_work_size_[0] = WORK_GROUP_SIZE;
+      input1BufferSize_ =
+          static_cast<size_t>(numElements[numElementsIdx_] * sizeof(float4));
+      input2BufferSize_ = 0;
+      output1BufferSize_ =
+          static_cast<size_t>(numElements[numElementsIdx_] * sizeof(float4));
+      _reqDataSize = 2.0 * numElements[numElementsIdx_] * sizeof(float4);
+      break;
+  }
+
+  PKT(printf("Test Parameters:\n"
+             "\tkernelTypeIdx: %i\n"
+             "\tmemPathIdx: %i\n"
+             "\tnumElementsIdx: %i\n"
+             "\tworkSizeIdx: %i\n"
+             "\n\n",
+             kernelTypeIdx_, memPathIdx_, numElementsIdx_, workSizeIdx_);)
+
+  /***********************************************************
+   * get context and queue
+   **********************************************************/
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  input1Buffer_ = 0;
+  output1Buffer_ = 0;
+  _errorFlag = false;  // Reset error code so a single error
+                       // doesn't prevent other subtests from running
+  _errorMsg = "";
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present
+    // instead of just returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    delete platforms;
+  }
+
+  /*
+   * If we could find our platform, use it, else die.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /*
+   * Get the requested device
+   */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  device = devices[0];
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device,
+                                              CL_QUEUE_PROFILING_ENABLE, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Global memory size
+  cl_ulong _maxMemoryAllocationSize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                     sizeof(cl_ulong),
+                                     &_maxMemoryAllocationSize, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS,
+               "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_SIZE) failed");
+#if 0
+    printf("Buffer Sizes: %i %i %i = %f\n",
+            input1BufferSize_,
+            input2BufferSize_,
+            output1BufferSize_,
+            _reqDataSize);
+#endif
+  _dataSizeTooBig = (_reqDataSize > _maxMemoryAllocationSize);
+  if (_dataSizeTooBig) {
+    // printf("DATA TOO LARGE FOR DEVICE !!!");
+    return;
+  }
+
+  // create kernel
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  args.clear();
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    cl_int intError;
+    char log[16384];
+    intError =
+        _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                        16384 * sizeof(char), log, NULL);
+    printf("Build error -> %s\n", log);
+    CHECK_RESULT(0, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "kernel1", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  /***********************************************************
+   * Allocate GPU Memory
+   **********************************************************/
+  cl_mem_flags inputBufferFlags = 0;
+  cl_mem_flags outputBufferFlags = 0;
+
+  // choose gpu source buffer type
+  switch (memPathIdx_) {
+    case 0:  // host memory
+      // printf("Allocating Host Memories\n");
+      // allocate "device" memory
+      inputBufferFlags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+      outputBufferFlags = CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+      input1Buffer_ = _wrapper->clCreateBuffer(
+          context_, inputBufferFlags, input1BufferSize_, NULL, &error_);
+      CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed");
+      if (input1Buffer_ == 0) printf("Error: %i\n", error_);
+      if (input2BufferSize_) {
+        input2Buffer_ = _wrapper->clCreateBuffer(
+            context_, inputBufferFlags, input2BufferSize_, NULL, &error_);
+        CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed");
+      }
+      output1Buffer_ = _wrapper->clCreateBuffer(
+          context_, outputBufferFlags, output1BufferSize_, NULL, &error_);
+      CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed");
+      if (output1Buffer_ == 0) printf("Error: %i\n", error_);
+
+      // map host memory
+      input1Ptr_ = (float *)_wrapper->clEnqueueMapBuffer(
+          cmd_queue_, input1Buffer_, true, CL_MAP_WRITE, 0, input1BufferSize_,
+          0, NULL, NULL, &error_);
+      if (input2BufferSize_) {
+        input2Ptr_ = (float *)_wrapper->clEnqueueMapBuffer(
+            cmd_queue_, input2Buffer_, true, CL_MAP_WRITE, 0, input2BufferSize_,
+            0, NULL, NULL, &error_);
+      }
+      output1Ptr_ = (float *)_wrapper->clEnqueueMapBuffer(
+          cmd_queue_, output1Buffer_, true, CL_MAP_READ, 0, output1BufferSize_,
+          0, NULL, NULL, &error_);
+      _wrapper->clFinish(cmd_queue_);
+      break;
+
+    case 1:  // device memory
+      // printf("Allocating Device Memories\n");
+      // allocate device memory
+      inputBufferFlags = CL_MEM_READ_WRITE;
+      outputBufferFlags = CL_MEM_READ_WRITE;
+      input1Buffer_ = _wrapper->clCreateBuffer(
+          context_, inputBufferFlags, input1BufferSize_, NULL, &error_);
+      CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed");
+      if (input2BufferSize_) {
+        input2Buffer_ = _wrapper->clCreateBuffer(
+            context_, inputBufferFlags, input2BufferSize_, NULL, &error_);
+        CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed");
+      }
+      output1Buffer_ = _wrapper->clCreateBuffer(
+          context_, outputBufferFlags, output1BufferSize_, NULL, &error_);
+      CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed");
+      // printf("\tDone Allocating Device Memory\n");
+
+      // allocate host memory
+      input1Ptr_ = new float[input1BufferSize_ / sizeof(float)];
+      if (input2BufferSize_) {
+        input2Ptr_ = new float[input2BufferSize_ / sizeof(float)];
+      }
+      output1Ptr_ = new float[output1BufferSize_ / sizeof(float)];
+      // printf("\tDone Allocating Host Memory\n");
+
+      break;
+    default:
+      CHECK_RESULT(1, "Invalid Memory Path Idx");
+      // invalid
+  }
+  for (unsigned int i = 0; i < input1BufferSize_ / sizeof(float); i++) {
+    input1Ptr_[i] = initFloat;
+  }
+  for (unsigned int i = 0; i < input2BufferSize_ / sizeof(float); i++) {
+    input2Ptr_[i] = initFloat;
+  }
+  for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) {
+    output1Ptr_[i] = zeroFloat;
+  }
+
+#if 0
+    printf("Allocating GPU: %.0fMB, %.0fMB\n",
+            static_cast<float>(1.f*input1BufferSize_/1024.f/1024.f),
+            static_cast<float>(1.f*output1BufferSize_/1024.f/1024.f));
+    input1Buffer_ = _wrapper->clCreateBuffer(
+            context_, inputBufferFlags, input1BufferSize_, NULL, &error_);
+    CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed");
+    output1Buffer_ = _wrapper->clCreateBuffer(
+            context_, outputBufferFlags, output1BufferSize_, NULL, &error_);
+    CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Output failed");
+    error_ = /*_wrapper->*/clEnqueueFillBuffer(
+            cmd_queue_, input1Buffer_, &initFloat, sizeof(initFloat),
+            0, input1BufferSize_, 0, NULL, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed");
+    error_ = /*_wrapper->*/clEnqueueFillBuffer(
+            cmd_queue_, output1Buffer_, &zeroFloat, sizeof(zeroFloat),
+            0, output1BufferSize_, 0, NULL, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed");
+
+    /***********************************************************
+     * Set Kernel Args
+     **********************************************************/
+    error_ = _wrapper->clSetKernelArg(
+                kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+    error_ = _wrapper->clSetKernelArg(
+                kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+#endif
+
+  EXIT("open");
+}
+
+/*******************************************************************************
+ * Run - execute full test once and return performance
+ ******************************************************************************/
+void OCLPerfKernelThroughput::run(void) {
+  // Binds the kernel arguments, performs one untimed warm-up launch followed
+  // by MAX_LOOP_ITER timed launches, then derives the average launch time,
+  // bandwidth and GFlop/s and stores them in testDescString / _perfInfo.
+  ENTER("run");
+  CPerfCounter timer;
+  if (!_dataSizeTooBig) {
+    switch (kernelTypeIdx_) {
+      case 0:  // Matrix Multiply: (output1, input1, input2)
+        error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(output1Buffer_),
+                                          (void *)&output1Buffer_);
+        CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+        error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(input1Buffer_),
+                                          (void *)&input1Buffer_);
+        CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+        error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(input2Buffer_),
+                                          (void *)&input2Buffer_);
+        CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+        break;
+      case 1:  // Flops/Byte: (input1, output1)
+        error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(input1Buffer_),
+                                          (void *)&input1Buffer_);
+        CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+        error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(output1Buffer_),
+                                          (void *)&output1Buffer_);
+        CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+        break;
+    }
+    launchKernel();  // untimed warm-up launch
+    timer.Reset();
+    timer.Start();
+    for (int i = 0; i < MAX_LOOP_ITER; i++) {
+      launchKernel();
+    }
+    timer.Stop();
+  }  // data not too large
+  // A skipped test is timed as 1 second; its results are zeroed further down.
+  double totalSec = _dataSizeTooBig ? 1 : timer.GetElapsedTime();
+
+  // analyze performance
+  avgKernelTime_ = (float)(totalSec / MAX_LOOP_ITER * 1000000);  // microseconds
+  switch (kernelTypeIdx_) {
+    case 0: {  // Matrix Multiply
+      // 2 * dim1 * dim1 * dim2 floating-point operations per launch.
+      const double flopCount = 2.0 * matrixDim1_ * matrixDim1_ * matrixDim2_;
+      bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) *
+                   1000000.f / avgKernelTime_;  // GB/s
+      gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0);
+      break;
+    }
+    case 1:  // Madds
+      // The flop rate follows directly from the bandwidth, so no separate
+      // flop count is needed.
+      bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) *
+                   1000000.f / avgKernelTime_;  // GB/s
+      gflops_ = bandwidth_ * flopsPerByte_;
+      break;
+  }
+  if (_dataSizeTooBig) {
+    printf("REQUESTED DATA SIZE EXCEEDS GLOBAL MEMORY !!!\n");
+    bandwidth_ = 0;
+    gflops_ = 0;
+    avgKernelTime_ = 0;
+  }
+
+  // Publish the one-line result description and the average launch time.
+  char buf[512];
+  SNPRINTF(
+      buf, sizeof(buf),
+      "Kernel:%7s; "
+      "Work:%4i; "
+      "Buff:%11.0f; "
+      "Path:%7s; "
+      "%10.5e GB/s; "
+      "%10.5e GFlop/s; ",
+      kernelType[kernelTypeIdx_], static_cast<int>(workSize[workSizeIdx_]),
+      _reqDataSize, memPath[memPathIdx_], bandwidth_, gflops_);
+  testDescString = buf;
+  _perfInfo = avgKernelTime_;
+  // Validate the host-visible output left behind by the launches.
+  if (!_dataSizeTooBig) checkData();
+  EXIT("run");
+}
+
+void OCLPerfKernelThroughput::launchKernel(void) {
+  // One iteration of the measured work: upload the inputs (explicit-copy path
+  // only), enqueue the kernel and wait for it, then download the output
+  // (explicit-copy path only). Each enqueue result is checked with
+  // CHECK_RESULT; the kernel arguments are bound beforehand in run().
+  ENTER("launchKernel")
+  /***********************************************************
+   * Copy Data To
+   **********************************************************/
+  switch (memPathIdx_) {
+    case 0:  // zero copy
+      // do nothing
+      // void *inputPtr = _wrapper->clEnqueueMapBuffer(
+      //        cmd_queue_, input1Buffer_, true, CL_MAP_READ,
+      //        0, input1BufferSize_, 0, NULL, NULL, &error_);
+      // void *outputPtr = _wrapper->clEnqueueMapBuffer(
+      //        cmd_queue_, output1Buffer_, true, CL_MAP_READ,
+      //        0, output1BufferSize_, 0, NULL, NULL, &error_);
+      //_wrapper->clFinish(cmd_queue_);
+      break;
+    case 1:  // explicit copy to device memory
+      // Both writes are blocking. Each result is checked right away so that
+      // a failure of the first transfer cannot be hidden by a successful
+      // second one.
+      error_ = _wrapper->clEnqueueWriteBuffer(
+          cmd_queue_, input1Buffer_, true, 0, input1BufferSize_,
+          (const void *)input1Ptr_, 0, NULL, NULL);
+      CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
+      if (input2BufferSize_) {
+        error_ = _wrapper->clEnqueueWriteBuffer(
+            cmd_queue_, input2Buffer_, true, 0, input2BufferSize_,
+            (const void *)input2Ptr_, 0, NULL, NULL);
+        CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
+      }
+      std::fflush(stdout);
+      _wrapper->clFinish(cmd_queue_);
+      // output1Buffer_ is not uploaded; the disabled upload is kept below.
+      //_error = _wrapper->clEnqueueWriteBuffer(
+      //        cmd_queue_, output1Buffer_, true, 0, output1BufferSize_,
+      //        (const void *)output1Ptr_, 0, NULL, NULL );
+      // CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
+      break;
+  }
+
+    /***********************************************************
+     * Set Kernel Args
+     **********************************************************/
+#if 0
+    error_ = _wrapper->clSetKernelArg(
+                kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+    error_ = _wrapper->clSetKernelArg(
+                kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+#endif
+
+  // printf("Launching Kernel: %ix%i threads\n", global_work_size_[0],
+  // local_work_size_[0]);
+
+  /***********************************************************
+   * Launch Kernel
+   **********************************************************/
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, work_dim_, NULL, (const size_t *)global_work_size_,
+      (const size_t *)local_work_size_, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  /***********************************************************
+   * Copy Data From
+   **********************************************************/
+  switch (memPathIdx_) {
+    case 0:  // zero copy
+      // do nothing
+      // void *inputPtr = _wrapper->clEnqueueMapBuffer(
+      //        cmd_queue_, input1Buffer_, true, CL_MAP_READ,
+      //        0, input1BufferSize_, 0, NULL, NULL, &error_);
+      // void *outputPtr = _wrapper->clEnqueueMapBuffer(
+      //        cmd_queue_, output1Buffer_, true, CL_MAP_READ,
+      //        0, output1BufferSize_, 0, NULL, NULL, &error_);
+      //_wrapper->clFinish(cmd_queue_);
+      break;
+    case 1:  // explicit copy to device memory
+      //_error = _wrapper->clEnqueueReadBuffer(
+      //        cmd_queue_, input1Buffer_, true, 0, input1BufferSize_,
+      //        (void *)input1Ptr_, 0, NULL, NULL );
+      // CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
+      // Blocking read of the kernel output into host memory.
+      error_ = _wrapper->clEnqueueReadBuffer(
+          cmd_queue_, output1Buffer_, true, 0, output1BufferSize_,
+          (void *)output1Ptr_, 0, NULL, NULL);
+      CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueReadBuffer failed");
+      break;
+  }
+
+  EXIT("launchKernel")
+}
+
+/*******************************************************************************
+ * Check Data
+ ******************************************************************************/
+void OCLPerfKernelThroughput::checkData() {
+  // Compares every float in output1Ptr_ against gold_ using a relative
+  // tolerance and sets _errorFlag at the first element outside the window.
+  _wrapper->clFinish(cmd_queue_);
+
+  const float errorThreshhold = 0.00001f;
+  // Take the magnitude of the tolerance so that eqMin <= eqMax also holds
+  // for a negative gold_.
+  float tolerance = errorThreshhold * gold_;
+  if (tolerance < 0.f) tolerance = -tolerance;
+  const float eqMax = gold_ + tolerance;
+  const float eqMin = gold_ - tolerance;
+
+  for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) {
+    float value = output1Ptr_[i];
+    // Inclusive bounds: an exact match has to pass even when the window
+    // collapses to a single value (gold_ == 0). A NaN fails both comparisons.
+    bool equal = (value >= eqMin && value <= eqMax);
+    if (!equal) {
+#if 0
+            printf("Output[%i] = %.6e; gold_ = %.6e; %s\n",
+                    i,
+                    value,
+                    gold_,
+                    equal ? "Equal" : "NOT Equal");
+#endif
+      // Stop at the first mismatch.
+      _errorFlag = true;
+      break;
+    }
+  }
+}
+
+/*******************************************************************************
+ * Close - delete all data and release opencl objects
+ ******************************************************************************/
+unsigned int OCLPerfKernelThroughput::close(void) {
+  // Releases the work-size arrays, the host pointers (unmapped or freed,
+  // depending on the memory path), the three buffers, and the kernel,
+  // program, command queue and context, resetting each member afterwards.
+  // Every release result is checked with CHECK_RESULT_NO_RETURN.
+  // Returns _crcword.
+  ENTER("close");
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+
+  if (global_work_size_) {
+    delete[] global_work_size_;
+    global_work_size_ = NULL;
+  }
+  if (local_work_size_) {
+    delete[] local_work_size_;
+    local_work_size_ = NULL;
+  }
+
+  // Host side first: how a pointer is given back depends on how open()
+  // obtained it.
+  switch (memPathIdx_) {
+    case 0:  // zero copy: the pointers are mapped views of the buffers
+      if (input1Ptr_) {
+        error_ = /*_wrapper->*/ clEnqueueUnmapMemObject(
+            cmd_queue_, input1Buffer_, input1Ptr_, 0, NULL, NULL);
+        CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                               "clEnqueueUnmapMemObject(input_) failed");
+        _wrapper->clFinish(cmd_queue_);
+        input1Ptr_ = 0;  // do not keep a pointer to an unmapped region
+      }
+      if (input2Ptr_) {
+        error_ = /*_wrapper->*/ clEnqueueUnmapMemObject(
+            cmd_queue_, input2Buffer_, input2Ptr_, 0, NULL, NULL);
+        CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                               "clEnqueueUnmapMemObject(input_) failed");
+        _wrapper->clFinish(cmd_queue_);
+        input2Ptr_ = 0;
+      }
+      if (output1Ptr_) {
+        error_ = /*_wrapper->*/ clEnqueueUnmapMemObject(
+            cmd_queue_, output1Buffer_, output1Ptr_, 0, NULL, NULL);
+        CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                               "clEnqueueUnmapMemObject(output_) failed");
+        _wrapper->clFinish(cmd_queue_);
+        output1Ptr_ = 0;
+      }
+      break;
+    case 1:  // explicit copy: the pointers are new[] arrays
+      if (input1Ptr_) {
+        delete[] input1Ptr_;
+        input1Ptr_ = 0;
+      }
+      if (input2Ptr_) {
+        delete[] input2Ptr_;
+        input2Ptr_ = 0;
+      }
+      if (output1Ptr_) {
+        delete[] output1Ptr_;
+        output1Ptr_ = 0;
+      }
+      break;
+  }
+
+  // Buffers are released by handle on both paths, so a buffer is freed even
+  // when its host pointer is null.
+  if (input1Buffer_) {
+    error_ = _wrapper->clReleaseMemObject(input1Buffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(input1Buffer_) failed");
+    input1Buffer_ = 0;
+  }
+  if (input2Buffer_) {
+    error_ = _wrapper->clReleaseMemObject(input2Buffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(input2Buffer_) failed");
+    input2Buffer_ = 0;
+  }
+  if (output1Buffer_) {
+    error_ = _wrapper->clReleaseMemObject(output1Buffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(output1Buffer_) failed");
+    output1Buffer_ = 0;
+  }
+
+  // Remaining OpenCL objects, in reverse order of their creation in open().
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel_ = 0;
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+    program_ = 0;
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    cmd_queue_ = 0;
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    context_ = 0;
+  }
+  // The queue has been released above, so no clFinish is issued here.
+
+  EXIT("close");
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h
new file mode 100644
index 0000000000..84777a1cdd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+/*******************************************************************************
+ * Kernel Throughput
+ *
+ *
+ *
+ *
+ *
+ *
+ ******************************************************************************/
+
+#ifndef _OCL_KernelThroughput_H_
+#define _OCL_KernelThroughput_H_
+
+#ifdef WIN32
+#include "xmmintrin.h"
+#endif
+
+#include "OCLTestImp.h"
+// Disabled includes, kept for reference (reflowed, one item per entry):
+//   <sstream>; WIN32_LEAN_AND_MEAN (restricts windows.h to the core API);
+//   "windows.h" followed by #undef Yield; <process.h>;
+//   <xmmintrin.h>; <emmintrin.h>; <pmmintrin.h>
+
+#define LARGE_INT long long
+#define UNSIGNED_LARGE_INT unsigned long long
+#define MAX_LOOP_ITER 10  // number of timed kernel launches per run()
+typedef cl_float4 float4;
+typedef void (*CPUKernel)(__m128 *, __m128 *, unsigned int);  // NOTE(review): needs __m128, but xmmintrin.h is included above only under WIN32 - confirm other platforms get it transitively
+
+// Performance test that times repeated launches of a generated kernel and
+// reports bandwidth and GFlop/s; the host buffers are either mapped
+// (memPathIdx_ 0) or copied explicitly (memPathIdx_ 1).
+class OCLPerfKernelThroughput : public OCLTestImp {
+ public:
+  OCLPerfKernelThroughput();
+  virtual ~OCLPerfKernelThroughput();
+
+ public:
+  virtual void open(unsigned int test, char *units, double &conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // warm-up launch plus MAX_LOOP_ITER timed launches
+  virtual unsigned int close(void);  // releases resources, returns _crcword
+
+  std::string shader_;  // OpenCL source handed to clCreateProgramWithSource
+  void genShaderMadds();
+  void genShaderMatrixMultiply();
+  void checkData();  // compares output1Ptr_ against gold_
+  void launchKernel();  // one upload / kernel / download iteration
+
+  // test parameters
+  int kernelTypeIdx_;  // 0 = Matrix Multiply, 1 = Flops/Byte (Madds)
+  int memPathIdx_;  // 0 = host (mapped) memory, 1 = device memory
+  int numElementsIdx_;
+  int workSizeIdx_;
+  float gold_;  // expected value of every output element
+  double _reqDataSize;  // compared with CL_DEVICE_MAX_MEM_ALLOC_SIZE in open()
+  bool _dataSizeTooBig;  // set in open(); run() then skips the launches
+
+  // device attributes
+  cl_uint maxComputeUnits_;
+  cl_uint maxClockFrequency_;
+
+  LARGE_INT numComputeUnits_;
+  LARGE_INT numWorkGroupsPerComputeUnit_;
+  LARGE_INT numThreads_;
+  cl_uint work_dim_;
+  size_t *global_work_size_;  // freed with delete[] in close()
+  size_t *local_work_size_;  // freed with delete[] in close()
+
+  // opencl objects
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_int error_;  // receives the result codes of the OpenCL calls
+
+  // kernel-specific values
+  int flopsPerByte_;
+  int matrixDim1_, matrixDim2_;
+
+  // buffers
+  size_t input1BufferSize_;  // buffer sizes are in bytes
+  size_t input2BufferSize_;  // the second input is skipped when this is 0
+  size_t output1BufferSize_;
+  cl_mem input1Buffer_;
+  cl_mem input2Buffer_;
+  cl_mem output1Buffer_;
+  float *input1Ptr_;
+  float *input2Ptr_;
+  float *output1Ptr_;
+
+  // performance results
+  float bandwidth_;      // GB/s
+  float gflops_;         // GFlop/s
+  float avgKernelTime_;  // microseconds
+};
+
+#endif  // _OCL_KernelThroughput_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp
new file mode 100644
index 0000000000..495f8c1a32
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp
@@ -0,0 +1,432 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfLDSLatency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 5;
+// Buffer widths in bytes: 2 KB up to 32 KB
+static const unsigned int Sizes[NUM_SIZES] = {2048, 4096, 8192, 16384, 32768};
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfLDSLatency::genShader() {
+  // Fills shader_ with the OpenCL source of the MemWalker and Overhead kernels.
+  shader_.clear();
+
+  // DO NOT PUBLISH
+  // Adapted from SiSoft Sandra 2013's memory latency test
+  shader_ +=
+      "__kernel\n"
+      //"__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void MemWalker(\n"
+      "    global uint * restrict input,\n"
+      "    global uint * restrict output,\n"
+      "    const uint uCount,  const uint uSize,\n"
+      "    const uint uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "    uint o = uOffset;\n"
+      "    uint lid = get_local_id(0);\n"
+      "    uint x = lid*o;\n"
+      "    local uint lclData[8192];\n"
+      "\n"
+      "    {\n"
+      "        uint i = uCount;\n"
+      "        while (i--) {\n"
+      "            uint oldX = x;\n"
+      "            x = input[x];\n"
+      "            lclData[oldX] = x;\n"
+      "        }\n"
+      "    }\n"
+      "\n"
+      "    x = lid*uOffset;\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        while (i--) {\n"
+      "            x = lclData[x] + o;\n"
+      "        }\n"
+      "    }\n"
+      "\n"
+      "    output[0] = x;\n"
+      "}\n";
+
+  shader_ += "\n\n";
+  shader_ +=
+      "__kernel\n"
+      //"__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void Overhead(\n"
+      "    global uint * restrict input,\n"
+      "    global uint * restrict output,\n"
+      "    const uint uCount,  const uint uSize,\n"
+      "    const uint uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "    local uint lclData[8192];\n"
+      "#ifdef USE_FLOAT\n"
+      "    {\n"
+      "        uint x = 0;\n"
+      "        uint i = uCount;\n"
+      "        while (i--) {\n"
+      "            uint oldX = x;\n"
+      "            x = input[x] /* + o*/;\n"
+      "            lclData[oldX] = x;\n"
+      "        }\n"
+      "    }\n"
+      "    float x = (float)input[0];\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        x = (float)uOffset*x;\n"
+      "        while (i--) {\n"
+      "            x += (float)i;\n"
+      "        }\n"
+      "    }\n"
+      "    output[0] = (uint)x + uOffset*lclData[8191];\n"
+      "#else\n"
+      "    {\n"
+      "        uint x = 0;\n"
+      "        uint i = uCount;\n"
+      "        while (i--) {\n"
+      "            uint oldX = x;\n"
+      "            x = input[x] /* + o*/;\n"
+      "            lclData[oldX] = x;\n"
+      "        }\n"
+      "    }\n"
+      "    uint x = input[0];\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        x = x*uOffset;\n"
+      "        while (i--) {\n"
+      "            x += i;\n"
+      "        }\n"
+      "    }\n"
+      "    output[0] = x + uOffset*lclData[8191];\n"
+      "#endif\n"
+      "}\n";
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+OCLPerfLDSLatency::OCLPerfLDSLatency() {
+  maxSize_ = 2048 * Sizes[NUM_SIZES - 1];  // 2048 times the largest width
+  _numSubTests = 2 * NUM_SIZES;  // every width runs for both moreThreads values
+}
+
+OCLPerfLDSLatency::~OCLPerfLDSLatency() {}  // empty: no work at destruction
+
+void OCLPerfLDSLatency::setData(cl_mem buffer, unsigned int val) {
+  // Fills buffer with an index chain (stride 1041 dwords); val is unused.
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, width_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < bufSizeDW_; i++) {
+    data[(i * (1024 + 17)) % bufSizeDW_] = ((i + 1) * (1024 + 17)) % bufSizeDW_;
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfLDSLatency::checkData(cl_mem buffer) {
+  // Maps the first dword of buffer and reports a failure unless it is 0.
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, sizeof(cl_uint), 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  if (data[0] != 0) {
+    printf("OutData= 0x%08x\n", data[0]);
+    CHECK_RESULT_NO_RETURN(data[0] != 0, "Data validation failed!\n");
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL);
+}
+
+// Sets up one latency subtest.
+//
+//   test       - subtest index: test % NUM_SIZES picks the input buffer size
+//                from Sizes[], and an odd test / NUM_SIZES selects the
+//                multi-thread variant used by run().
+//   units      - not used by this function.
+//   conversion - output, always set to 1.0.
+//   deviceId   - index of the device to use on the selected platform.
+//
+// Creates the context, command queue, both buffers, the program and the
+// MemWalker / Overhead kernels, binds their arguments and fills the input
+// buffer. Failures are reported through CHECK_RESULT.
+void OCLPerfLDSLatency::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  moreThreads = false;
+
+  // close() releases every non-zero handle, so all of them (kernel2_
+  // included) are cleared before anything below can fail.
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  kernel2_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  _errorFlag = false;  // Reset error code so a single error doesn't prevent
+                       // other subtests from running
+  _errorMsg = "";
+  isAMD_ = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only one handle is needed from the list: copy it and free the array
+    // (new[] pairs with delete[]) so it cannot leak if the check below bails
+    // out.
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    char pbuf[100] = "";  // stays empty if the vendor query fails
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is deliberately not checked here.
+    if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD_ = true;
+    }
+  }
+
+  width_ = Sizes[test % NUM_SIZES];
+
+  bufSizeDW_ = width_ / sizeof(cl_uint);
+  moreThreads = ((test / NUM_SIZES) % 2) ? true : false;
+
+  CHECK_RESULT(platform == 0, "Couldn't find OpenCL platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "Failed to allocate devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Pick the requested device, then free the list before the checks so that
+  // no exit path leaks it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, width_, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, 0, 1 * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  const char *tmp = shader_.c_str();
+  program_ =
+      _wrapper->clCreateProgramWithSource(context_, 1, &tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  if (isAMD_) args += " -D USE_FLOAT";
+
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // stays empty if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // The condition must be true for the failure to be recorded.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "MemWalker", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel(MemWalker) failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "Overhead", &error_);
+  CHECK_RESULT(kernel2_ == 0, "clCreateKernel(Overhead) failed");
+
+  // Limit the repeats, large buffers will have more samples, but the test runs
+  // for a long time
+  repeats_ = std::max((maxSize_ >> 4) / bufSizeDW_, 1u);
+
+  // Both kernels are given the same seven arguments.
+  unsigned int zero = 0;
+  int bMem = 1;
+  cl_kernel kernels[2] = {kernel_, kernel2_};
+  for (int k = 0; k < 2; k++) {
+    cl_kernel kern = kernels[k];
+    error_ = _wrapper->clSetKernelArg(kern, 0, sizeof(cl_mem),
+                                      (void *)&inBuffer_);
+    error_ = _wrapper->clSetKernelArg(kern, 1, sizeof(cl_mem),
+                                      (void *)&outBuffer_);
+    error_ = _wrapper->clSetKernelArg(kern, 2, sizeof(cl_uint),
+                                      (void *)&bufSizeDW_);
+    error_ = _wrapper->clSetKernelArg(kern, 3, sizeof(cl_uint),
+                                      (void *)&bufSizeDW_);
+    error_ = _wrapper->clSetKernelArg(kern, 4, sizeof(cl_uint), (void *)&zero);
+    error_ = _wrapper->clSetKernelArg(kern, 5, sizeof(cl_int), (void *)&bMem);
+    error_ = _wrapper->clSetKernelArg(kern, 6, sizeof(cl_uint),
+                                      (void *)&repeats_);
+  }
+
+  setData(inBuffer_, (int)1.0f);
+}
+
+// Runs a short warm-up, then times one launch of kernel_ and one launch of
+// kernel2_ (created as "Overhead" in open()). The time difference per read,
+// in nanoseconds, is stored in _perfInfo.
+void OCLPerfLDSLatency::run(void) {
+  // One work-item by default; with moreThreads a single work-group of 64
+  // (AMD platform) or 32 (any other platform) work-items is launched.
+  int global = 1;
+  if (moreThreads) {
+    global = isAMD_ ? 64 : 32;
+  }
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)global};
+
+  // Warm-up: launch once with argument 2 temporarily set to 128, then put the
+  // real value back for the timed launch.
+  unsigned int warmup = 128;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), (void *)&warmup);
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                            global_work_size, local_work_size,
+                                            0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  _wrapper->clFinish(cmd_queue_);
+
+  // Restore input buffer when finished as it may have been modified by RW test
+  setData(inBuffer_, 1);
+
+  CPerfCounter kernelTimer;
+  CPerfCounter overheadTimer;
+
+  // Timed launch of the main kernel.
+  kernelTimer.Reset();
+  kernelTimer.Start();
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                            global_work_size, local_work_size,
+                                            0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+  kernelTimer.Stop();
+
+  checkData(outBuffer_);
+
+  // Timed launch of the second kernel; its time is subtracted below.
+  overheadTimer.Reset();
+  overheadTimer.Start();
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel2_, 1, NULL,
+                                            global_work_size, local_work_size,
+                                            0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+  overheadTimer.Stop();
+
+  const double sec =
+      kernelTimer.GetElapsedTime() - overheadTimer.GetElapsedTime();
+
+  // Read latency in ns
+  const double reads = (double)bufSizeDW_ * (double)repeats_;
+  _perfInfo = (float)(sec * (double)(1e09) / reads);
+
+  char buf[256];
+  const char *pad = "";  // printed through %10s, i.e. as ten blanks
+  SNPRINTF(buf, sizeof(buf), "%10s %2d threads, %8d reads, %5d repeats (ns)",
+           pad, global, bufSizeDW_, repeats_);
+  testDescString = buf;
+}
+
+// Waits for the queue to drain, releases every OpenCL object whose handle is
+// still set and returns _crcword. A failed release is reported through
+// CHECK_RESULT_NO_RETURN, so the remaining objects are still released.
+unsigned int OCLPerfLDSLatency::close(void) {
+  _wrapper->clFinish(cmd_queue_);
+
+  // Buffers
+  if (inBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  // Kernels and the program they were created from
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (kernel2_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+
+  // Queue, then the context
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h
new file mode 100644
index 0000000000..29eedfed79
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_LDSLATENCY_H_
+#define _OCL_LDSLATENCY_H_
+
+#include "OCLTestImp.h"
+
+// Latency perf test built on OCLTestImp. open() prepares one subtest, run()
+// times it and reports a per-read time in ns, close() releases the OpenCL
+// objects.
+class OCLPerfLDSLatency : public OCLTestImp {
+ public:
+  OCLPerfLDSLatency();
+  virtual ~OCLPerfLDSLatency();
+
+ public:
+  // Creates context, queue, buffers, program and kernels for subtest `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Warm-up plus two timed launches; fills _perfInfo and testDescString.
+  virtual void run(void);
+  // Releases every non-zero handle below and returns _crcword.
+  virtual unsigned int close(void);
+
+  // Kernel source handed to clCreateProgramWithSource by open(); open() calls
+  // genShader() right before reading it.
+  std::string shader_;
+  void genShader(void);
+  // Called by open() and run() on inBuffer_ with the value 1.
+  void setData(cl_mem buffer, unsigned int data);
+  // Called by run() on outBuffer_ after the first timed launch.
+  void checkData(cl_mem buffer);
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;   // created from the "MemWalker" kernel
+  cl_kernel kernel2_;  // created from the "Overhead" kernel
+  cl_mem inBuffer_;    // width_ bytes
+  cl_mem outBuffer_;   // sizeof(cl_uint) bytes
+  cl_int error_;       // result of the most recent OpenCL call
+
+  unsigned int width_;      // input buffer size in bytes, taken from Sizes[]
+  unsigned int bufSizeDW_;  // width_ / sizeof(cl_uint)
+  unsigned int repeats_;    // max((maxSize_ >> 4) / bufSizeDW_, 1); kernel arg 6
+  unsigned int maxSize_;    // read by open() to derive repeats_; not set there
+  bool isAMD_;       // platform vendor is "Advanced Micro Devices, Inc."
+  bool moreThreads;  // run() launches 64 (AMD) or 32 work-items instead of 1
+};
+
+#endif  // _OCL_LDSLATENCY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp
new file mode 100644
index 0000000000..1bb9087b17
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp
@@ -0,0 +1,395 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfLDSReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: SNPRINTF maps to sprintf_s when WIN_OS is defined and
+// to snprintf everywhere else.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Output buffer sizes in bytes, one per size subtest.
+static const unsigned int Sizes[NUM_SIZES] = {
+    256u * 1024u,          // 256 KB
+    1024u * 1024u,         // 1 MB
+    4u * 1024u * 1024u,    // 4 MB
+    16u * 1024u * 1024u};  // 16 MB
+
+// Writes the OpenCL source of the _ldsReadSpeed kernel into shader_ and
+// records the byte size of its __local array in ldsSizeBytes_.
+//
+//   idx 0     - 2048-float array, 32 unrolled reads per work-item
+//   idx 1     - 768-float array, 32 loop iterations of 8 reads whose offsets
+//               are based on the local id
+//   otherwise - 256-float array, 32 loop iterations of 8 reads whose offsets
+//               depend only on the loop counter
+void OCLPerfLDSReadSpeed::genShader(unsigned int idx) {
+  // All three variants open with the same kernel signature.
+  shader_ =
+      "__kernel __attribute__((reqd_work_group_size(64,1,1))) void "
+      "_ldsReadSpeed(__global float *outBuf, float c)\n"
+      "{\n";
+
+  // The two looped variants end with the same conditional store.
+  static const char *const loopTail =
+      "val0 += val1;\n"
+      "val1 = min(val0,1.0f);\n"
+      "if ((lid + val1) < 0){\n"
+      "    outBuf[gid] = val0;\n"
+      "}\n"
+      "}\n";
+
+  switch (idx) {
+    case 0:
+      shader_ +=
+          "    uint gid = (int) get_global_id(0);\n"
+          "    uint lid = (int) get_local_id(0);\n"
+          "    __local float localLocal[2048];\n"
+          "    float val1 = c;\n"
+          "    float val2 = c;\n"
+          "    float val3 = c;\n"
+          "    float val4 = c;\n"
+          "    uint hacklid = gid % 64;\n"
+          "    for (int i = 0; i < (2048/64); i++) {\n"
+          "        localLocal[hacklid + i*64] = lid;\n"
+          "    }\n"
+          "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+          "    val1 += localLocal[lid+0];\n"
+          "    val2 += localLocal[lid+64];\n"
+          "    val3 += localLocal[lid+128];\n"
+          "    val4 += localLocal[lid+192];\n"
+          "    val1 += localLocal[lid+256];\n"
+          "    val2 += localLocal[lid+320];\n"
+          "    val3 += localLocal[lid+384];\n"
+          "    val4 += localLocal[lid+448];\n"
+          "    val1 += localLocal[lid+512];\n"
+          "    val2 += localLocal[lid+576];\n"
+          "    val3 += localLocal[lid+640];\n"
+          "    val4 += localLocal[lid+704];\n"
+          "    val1 += localLocal[lid+768];\n"
+          "    val2 += localLocal[lid+832];\n"
+          "    val3 += localLocal[lid+896];\n"
+          "    val4 += localLocal[lid+960];\n"
+          "    val1 += localLocal[lid+1024];\n"
+          "    val2 += localLocal[lid+1088];\n"
+          "    val3 += localLocal[lid+1152];\n"
+          "    val4 += localLocal[lid+1216];\n"
+          "    val1 += localLocal[lid+1280];\n"
+          "    val2 += localLocal[lid+1344];\n"
+          "    val3 += localLocal[lid+1408];\n"
+          "    val4 += localLocal[lid+1472];\n"
+          "    val1 += localLocal[lid+1536];\n"
+          "    val2 += localLocal[lid+1600];\n"
+          "    val3 += localLocal[lid+1664];\n"
+          "    val4 += localLocal[lid+1728];\n"
+          "    val1 += localLocal[lid+1792];\n"
+          "    val2 += localLocal[lid+1856];\n"
+          "    val3 += localLocal[lid+1920];\n"
+          "    val4 += localLocal[lid+1984];\n"
+          "    outBuf[gid] = val1+val2+val3+val4;\n"
+          "}\n";
+      ldsSizeBytes_ = 2048 * 4;
+      break;
+
+    case 1:
+      shader_ +=
+          "    uint gid = (uint) get_global_id(0);\n"
+          "    int lid = (int) get_local_id(0);\n"
+          "    __local float localLocal[768];\n"
+          "    float val0 = 0.0f;\n"
+          "    float val1 = 0.0f;\n"
+          "    uint hacklid = gid % 64;\n"
+          "    for (int i = 0; i < (768/64); i++) {\n"
+          "        localLocal[hacklid + i*64] = lid;\n"
+          "    }\n"
+          "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+          "#pragma nounroll\n"
+          "for (uint i = 0; i < 32;i++)\n"
+          "{\n"
+          "    val0 += localLocal[lid+0];\n"
+          "    val1 += localLocal[lid+64];\n"
+          "    val0 += localLocal[lid+128];\n"
+          "    val1 += localLocal[lid+192];\n"
+          "    val0 += localLocal[lid+256];\n"
+          "    val1 += localLocal[lid+320];\n"
+          "    val0 += localLocal[lid+384];\n"
+          "    val1 += localLocal[lid+448];\n"
+          "    lid += 1;\n"
+          "}\n";
+      shader_ += loopTail;
+      ldsSizeBytes_ = 768 * 4;
+      break;
+
+    default:
+      shader_ +=
+          "    uint gid = (uint) get_global_id(0);\n"
+          "    int lid = (int) get_local_id(0);\n"
+          "    __local float localLocal[256];\n"
+          "    float val0 = 0.0f;\n"
+          "    float val1 = 0.0f;\n"
+          "    uint hacklid = gid % 64;\n"
+          "    for (int i = 0; i < (256/64); i++) {\n"
+          "        localLocal[hacklid + i*64] = lid;\n"
+          "    }\n"
+          "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+          "#pragma nounroll\n"
+          "for (uint i = 0; i < 32;i++)\n"
+          "{\n"
+          "    val0 += localLocal[8*i+0];\n"
+          "    val1 += localLocal[8*i+1];\n"
+          "    val0 += localLocal[8*i+2];\n"
+          "    val1 += localLocal[8*i+3];\n"
+          "    val0 += localLocal[8*i+4];\n"
+          "    val1 += localLocal[8*i+5];\n"
+          "    val0 += localLocal[8*i+6];\n"
+          "    val1 += localLocal[8*i+7];\n"
+          "}\n";
+      shader_ += loopTail;
+      ldsSizeBytes_ = 256 * 4;
+      break;
+  }
+}
+
+// One subtest per (kernel variant, buffer size) pair: open() maps the subtest
+// index to test / NUM_SIZES (variant) and test % NUM_SIZES (size).
+OCLPerfLDSReadSpeed::OCLPerfLDSReadSpeed() {
+  _numSubTests = 3 * NUM_SIZES;
+}
+
+// Nothing to do here; the OpenCL objects are released in close().
+OCLPerfLDSReadSpeed::~OCLPerfLDSReadSpeed() {
+}
+
+// Fills `buffer` (bufSize_ bytes, treated as floats) with `val` through a
+// blocking write mapping, then unmaps it and waits for the queue to drain.
+// A failed mapping is reported through CHECK_RESULT instead of being
+// dereferenced.
+void OCLPerfLDSReadSpeed::setData(cl_mem buffer, float val) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_WRITE, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  // clEnqueueMapBuffer returns NULL on failure; writing through it would crash.
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+
+  const size_t count = bufSize_ / sizeof(float);
+  for (size_t i = 0; i < count; i++) {
+    data[i] = val;
+  }
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+// Maps `buffer` for reading and checks that every float equals numReads_.
+// On the first mismatch it prints the index and up to four values starting
+// there (never reading past the end of the mapping), records the failure and
+// stops scanning. The buffer is always unmapped afterwards.
+void OCLPerfLDSReadSpeed::checkData(cl_mem buffer) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_READ, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  // clEnqueueMapBuffer returns NULL on failure; reading through it would crash.
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+
+  const unsigned int count = bufSize_ >> 2;  // number of 4-byte floats
+  for (unsigned int i = 0; i < count; i++) {
+    if (data[i] != (float)numReads_) {
+      printf("Data validation failed at index %u!\n", i);
+      printf("Expected %u\nGot", numReads_);
+      // Show the mismatching value and at most three that follow it.
+      for (unsigned int j = i; j < count && j - i < 4; j++) {
+        printf(" %g", (double)data[j]);
+      }
+      printf("\n");
+      // The condition must be true for the failure to be recorded.
+      CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+      break;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+// Context notification callback passed to clCreateContext in open().
+// Intentionally empty: nothing is done with the reported error.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Sets up one read-speed subtest.
+//
+//   test       - subtest index: test % NUM_SIZES picks the output buffer size
+//                from Sizes[], test / NUM_SIZES picks the kernel variant
+//                passed to genShader().
+//   units      - not used by this function.
+//   conversion - output, always set to 1.0.
+//   deviceId   - index of the device to use on the selected platform.
+//
+// Creates the context, command queue, output buffer, program and kernel,
+// binds the kernel arguments and fills the output buffer. Failures are
+// reported through CHECK_RESULT.
+void OCLPerfLDSReadSpeed::open(unsigned int test, char *units,
+                               double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only one handle is needed from the list: copy it and free the array
+    // (new[] pairs with delete[]) so it cannot leak if the check below bails
+    // out.
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is deliberately not checked here.
+  }
+
+  numReads_ = 32;
+  width_ = Sizes[test % NUM_SIZES];
+  shaderIdx_ = test / NUM_SIZES;
+
+  bufSize_ = width_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Pick the requested device, then free the list before the checks so that
+  // no exit path (including success) leaks it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader(shaderIdx_);
+  const char *tmp = shader_.c_str();
+  program_ =
+      _wrapper->clCreateProgramWithSource(context_, 1, &tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // stays empty if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // The condition must be true for the failure to be recorded.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_ldsReadSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  // Kernel arguments: the output buffer and the float constant `c`.
+  float foo = 0;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_float), (void *)&foo);
+
+  setData(outBuffer_, 1.2345678f);
+}
+
+// Enqueues the kernel NUM_ITER times back to back, waits for the queue to
+// drain and converts the elapsed time into a bandwidth figure (GB/s) that is
+// stored in _perfInfo; testDescString receives the matching description.
+void OCLPerfLDSReadSpeed::run(void) {
+  const int global = bufSize_ / sizeof(cl_float);  // one work-item per float
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)64};
+
+  CPerfCounter timer;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int remaining = NUM_ITER; remaining > 0; --remaining) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              global_work_size,
+                                              local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+  timer.Stop();
+
+  const double sec = timer.GetElapsedTime();
+
+  // The looped kernel variants perform 8 reads in each of their 32 iterations.
+  // NOTE(review): numReads_ is scaled in place, so a second run() without a
+  // new open() would scale it again.
+  const char *label;
+  switch (shaderIdx_) {
+    case 0:
+      label = " def kernel";
+      break;
+    case 1:
+      label = "SI friendly";
+      numReads_ *= 8;
+      break;
+    default:
+      label = "  broadcast";
+      numReads_ *= 8;
+      break;
+  }
+
+  // LDS bandwidth in GB/s
+  // We have one extra write per LDS location to initialize LDS
+  const double bytesPerItem =
+      (double)(numReads_ * sizeof(cl_float) + ldsSizeBytes_ / 64);
+  const double perf =
+      (double)global * bytesPerItem * NUM_ITER * (double)(1e-09) / sec;
+  _perfInfo = (float)perf;
+
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %s %8d threads, %3d reads (GB/s) ", label,
+           global, numReads_);
+  testDescString = buf;
+  // checkData(outBuffer_);
+}
+
+// Waits for the queue to drain, releases every OpenCL object whose handle is
+// still set and returns _crcword. A failed release is reported through
+// CHECK_RESULT_NO_RETURN, so the remaining objects are still released.
+unsigned int OCLPerfLDSReadSpeed::close(void) {
+  _wrapper->clFinish(cmd_queue_);
+
+  // Output buffer
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  // Kernel and the program it was created from
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+
+  // Queue, then the context
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h
new file mode 100644
index 0000000000..3214cb471f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_LDSReadSpeed_H_
+#define _OCL_LDSReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Read-speed perf test built on OCLTestImp. open() prepares one subtest,
+// run() times NUM_ITER kernel launches and reports GB/s, close() releases the
+// OpenCL objects.
+class OCLPerfLDSReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfLDSReadSpeed();
+  virtual ~OCLPerfLDSReadSpeed();
+
+ public:
+  // Creates context, queue, output buffer, program and kernel for `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Times the kernel launches; fills _perfInfo and testDescString.
+  virtual void run(void);
+  // Releases every non-zero handle below and returns _crcword.
+  virtual unsigned int close(void);
+
+  // Kernel source produced by genShader() and compiled in open().
+  std::string shader_;
+  // Selects one of three kernel variants (0, 1, anything else).
+  void genShader(unsigned int idx);
+  // Fills every float of `buffer` with `data`.
+  void setData(cl_mem buffer, float data);
+  // Compares every float of `buffer` with numReads_.
+  void checkData(cl_mem buffer);
+
+  // Number of kernel launches timed by run().
+  static const unsigned int NUM_ITER = 100;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;  // bufSize_ bytes
+  cl_int error_;      // result of the most recent OpenCL call
+
+  unsigned int width_;         // Sizes[test % NUM_SIZES]
+  unsigned int bufSize_;       // output buffer size in bytes (= width_)
+  unsigned int vecSizeIdx_;    // not referenced in OCLPerfLDSReadSpeed.cpp
+  unsigned int numReads_;      // 32 after open(); run() scales it by 8 for
+                               // the looped kernel variants
+  unsigned int shaderIdx_;     // test / NUM_SIZES, passed to genShader()
+  unsigned int ldsSizeBytes_;  // __local array size, set by genShader()
+};
+
+#endif  // _OCL_LDSReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp
new file mode 100644
index 0000000000..220ddf430c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp
@@ -0,0 +1,940 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMandelbrot.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings: pick the bounded formatting routine per platform.
+// When WIN_OS is defined SNPRINTF expands to sprintf_s, otherwise to snprintf;
+// call sites pass (buffer, buffer size, format, args...).
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// One square region of the complex plane to render.
+// (x, y) is the centre of the region and `width` is its side length:
+// OCLPerfMandelbrot::open() derives the top-left corner as
+// (x - width/2, y + width/2) and the per-pixel step as width / width_.
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+// Regions exercised by the test; a sub-test selects one via
+// `test % numCoords`.
+coordRec coords[] = {
+    {0.0, 0.0, 4.0},                                     // Whole set
+    {0.0, 0.0, 0.00001},                                 // All black
+    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
+};
+
+// Number of entries in coords[].
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+// OpenCL C source for the scalar single-precision kernel: one work-item per
+// pixel, an escape-time loop bounded by maxIter, and the iteration count is
+// stored in out[tid].  Every MUL_ADD_INS token is textually replaced with
+// "mad" or "fma" by OCLPerfMandelbrot::open() before the program is built.
+static const char *float_mandel =
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % width;\n"
+    "    int j = tid / width;\n"
+    "    float x0 = (float)(xPos + xStep*i);\n"
+    "    float y0 = (float)(yPos + yStep*j);\n"
+    "\n"
+    "    float x = x0;\n"
+    "    float y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    float tmp;\n"
+    "    for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++)\n"
+    "    {\n"
+    "        tmp = x;\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "    }\n"
+    "    out[tid] = iter;\n"
+    "}\n";
+
+// OpenCL C source for the float4 kernel: each work-item computes four
+// horizontally adjacent pixels and writes one uint4 of iteration counts.
+// The main loop advances 16 iterations per pass while any lane is still
+// within |z|^2 <= 4; the "remainder" loop then single-steps (at most 16
+// times) to recover the exact per-lane count.  MUL_ADD_INS is substituted
+// with "mad" or "fma" by OCLPerfMandelbrot::open() before compilation.
+static const char *float_mandel_vec =
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % (width/4);\n"
+    "    int j = tid / (width/4);\n"
+    "    int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
+    "    int4 vecj = (int4)(j, j, j, j);\n"
+    "    float4 x0;\n"
+    "    x0.s0 = (float)(xPos + xStep*veci.s0);\n"
+    "    x0.s1 = (float)(xPos + xStep*veci.s1);\n"
+    "    x0.s2 = (float)(xPos + xStep*veci.s2);\n"
+    "    x0.s3 = (float)(xPos + xStep*veci.s3);\n"
+    "    float4 y0;\n"
+    "    y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
+    "    y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
+    "    y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
+    "    y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
+    "\n"
+    "    float4 x = x0;\n"
+    "    float4 y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    float4 tmp;\n"
+    "    int4 stay;\n"
+    "    int4 ccount = 0;\n"
+    "    float4 savx = x;\n"
+    "    float4 savy = y;\n"
+    "    stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "    for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
+    "maxIter); iter+=16)\n"
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    "        savx = select(savx,x,stay);\n"
+    "        savy = select(savy,y,stay);\n"
+    "        ccount -= stay*16;\n"
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            // More efficient to use scalar ops here: Why?\n"
+    "            stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
+    "maxIter);\n"
+    "            stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
+    "maxIter);\n"
+    "            stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
+    "maxIter);\n"
+    "            stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
+    "maxIter);\n"
+    "		     tmp = x;\n"
+    "            x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "            y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
+    "            savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
+    "            savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
+    "            savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
+    "            savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
+    "            savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
+    "            savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
+    "            savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
+    "        } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
+    "    }\n"
+    "    __global uint4 *vecOut = (__global uint4 *)out;\n"
+    "    vecOut[tid] = convert_uint4(ccount);\n"
+    "}\n";
+
+// OpenCL C source for the scalar single-precision kernel with the inner loop
+// unrolled 16 times.  The kernel text defines FAST itself, so the main loop
+// runs on `iter < maxIter` alone and breaks as soon as the point has escaped;
+// the "remainder" loop then single-steps (at most 16 times) from the last
+// saved in-bounds state to get the exact count.  MUL_ADD_INS is substituted
+// with "mad" or "fma" by OCLPerfMandelbrot::open() before compilation.
+static const char *float_mandel_unroll =
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % width;\n"
+    "    int j = tid / width;\n"
+    "    float x0 = (float)(xPos + xStep*(float)i);\n"
+    "    float y0 = (float)(yPos + yStep*(float)j);\n"
+    "\n"
+    "    float x = x0;\n"
+    "    float y = y0;\n"
+    "\n"
+    "#define FAST\n"
+    "    uint iter = 0;\n"
+    "    float tmp;\n"
+    "    int stay;\n"
+    "    int ccount = 0;\n"
+    "    stay = (x*x+y*y) <= 4.0;\n"
+    "    float savx = x;\n"
+    "    float savy = y;\n"
+    "#ifdef FAST\n"
+    "    for (iter = 0; (iter < maxIter); iter+=16)\n"
+    "#else\n"
+    "    for (iter = 0; stay && (iter < maxIter); iter+=16)\n"
+    "#endif\n"
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        stay = (x*x+y*y) <= 4.0;\n"
+    "        savx = select(savx,x,stay);\n"
+    "        savy = select(savy,y,stay);\n"
+    "        ccount += stay*16;\n"
+    "#ifdef FAST\n"
+    "        if (!stay)\n"
+    "            break;\n"
+    "#endif\n"
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!stay)\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);\n"
+    "            tmp = x;\n"
+    "            x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "            y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx = select(savx,x,stay);\n"
+    "            savy = select(savy,y,stay);\n"
+    "         } while (stay && iter);\n"
+    "    }\n"
+    "    out[tid] = (uint)ccount;\n"
+    "}\n";
+
+// OpenCL C source for the scalar double-precision kernel: one work-item per
+// pixel, same escape-time loop as float_mandel but computed in double.
+// The fp64 extension pragma is chosen by the USE_CL_AMD_FP64 /
+// USE_CL_KHR_FP64 macros, which OCLPerfMandelbrot::open() passes as -D build
+// options.  MUL_ADD_INS is substituted with "mad" or "fma" by open().
+static const char *double_mandel =
+    "#ifdef USE_CL_AMD_FP64\n"
+    "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+    "#endif\n"
+    "#ifdef USE_CL_KHR_FP64\n"
+    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+    "#endif\n"
+    "__kernel void mandelbrot(__global uint *out, uint width, double xPos, "
+    "double yPos, double xStep, double yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % width;\n"
+    "    int j = tid / width;\n"
+    "    double x0 = (double)(xPos + xStep*i);\n"
+    "    double y0 = (double)(yPos + yStep*j);\n"
+    "\n"
+    "    double x = x0;\n"
+    "    double y = y0;\n"
+    "\n"
+    "    uint iter = 0;\n"
+    "    double tmp;\n"
+    "    for (iter = 0; (x*x + y*y <= 4.0) && (iter < maxIter); iter++)\n"
+    "    {\n"
+    "        tmp = x;\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "    }\n"
+    "    out[tid] = iter;\n"
+    "}\n";
+
+// OpenCL C source for the scalar double-precision kernel with the inner loop
+// unrolled 16 times.  Structure mirrors float_mandel_unroll (FAST is defined
+// in the kernel text, followed by an at-most-16-step remainder loop), except
+// that the saved state is updated with a ternary instead of select().
+// The fp64 pragma is chosen by the USE_CL_AMD_FP64 / USE_CL_KHR_FP64 build
+// macros; MUL_ADD_INS is substituted with "mad" or "fma" by
+// OCLPerfMandelbrot::open() before compilation.
+static const char *double_mandel_unroll =
+    "#ifdef USE_CL_AMD_FP64\n"
+    "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+    "#endif\n"
+    "#ifdef USE_CL_KHR_FP64\n"
+    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+    "#endif\n"
+    "__kernel void mandelbrot(__global uint *out, uint width, double xPos, "
+    "double yPos, double xStep, double yStep, uint maxIter)\n"
+    "{\n"
+    "    int tid = get_global_id(0);\n"
+    "    int i = tid % width;\n"
+    "    int j = tid / width;\n"
+    "    double x0 = (double)(xPos + xStep*(double)i);\n"
+    "    double y0 = (double)(yPos + yStep*(double)j);\n"
+    "\n"
+    "    double x = x0;\n"
+    "    double y = y0;\n"
+    "\n"
+    "#define FAST\n"
+    "    uint iter = 0;\n"
+    "    double tmp;\n"
+    "    int stay;\n"
+    "    int ccount = 0;\n"
+    "    stay = (x*x+y*y) <= 4.0;\n"
+    "    double savx = x;\n"
+    "    double savy = y;\n"
+    "#ifdef FAST\n"
+    "    for (iter = 0; (iter < maxIter); iter+=16)\n"
+    "#else\n"
+    "    for (iter = 0; stay && (iter < maxIter); iter+=16)\n"
+    "#endif\n"
+    "    {\n"
+    "        x = savx;\n"
+    "        y = savy;\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        // Two iterations\n"
+    "        tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*x,y,y0);\n"
+    "        x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n"
+    "        y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "\n"
+    "        stay = (x*x+y*y) <= 4.0;\n"
+    "        savx = (stay ? x : savx);//select(savx,x,stay);\n"
+    "        savy = (stay ? y : savy);//select(savy,y,stay);\n"
+    "        ccount += stay*16;\n"
+    "#ifdef FAST\n"
+    "        if (!stay)\n"
+    "            break;\n"
+    "#endif\n"
+    "    }\n"
+    "    // Handle remainder\n"
+    "    if (!stay)\n"
+    "    {\n"
+    "        iter = 16;\n"
+    "        do\n"
+    "        {\n"
+    "            x = savx;\n"
+    "            y = savy;\n"
+    "            stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);\n"
+    "            tmp = x;\n"
+    "            x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n"
+    "            y = MUL_ADD_INS(2.0f*tmp,y,y0);\n"
+    "            ccount += stay;\n"
+    "            iter--;\n"
+    "            savx = (stay ? x : savx);//select(savx,x,stay);\n"
+    "            savy = (stay ? y : savy);//select(savy,y,stay);\n"
+    "         } while (stay && iter);\n"
+    "    }\n"
+    "    out[tid] = (uint)ccount;\n"
+    "}\n";
+
+// Sub-test index of the first fma variant (shader 5 * 3 coordinate sets).
+// run() uses it to map a mad sub-test onto its fma counterpart, whose expected
+// value is accepted as an alternative on AMD GPUs.
+static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
+
+// Expected results for each kernel run at each coord.
+// Indexed by sub-test number (shader * numCoords + coord): 10 shaders x 3
+// coordinate sets = 30 entries.  Each value is the sum of the per-pixel
+// iteration counts accumulated by checkData().
+unsigned long long expectedIters[] = {
+    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
+    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
+    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
+    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
+
+// NVIDIA supports cl_khr_fp64, so it gets better results for doubles.  It is
+// not known why the float results differ as well.
+// Same layout as expectedIters[]; run() uses this table whenever the platform
+// is not an AMD GPU.
+unsigned long long expectedItersNV[] = {
+    203277748ull,  2147483648ull, 120254651ull,  203277748ull,
+    2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
+    120254651ull,  203315226ull,  2147483648ull, 120091921ull,
+    203315226ull,  2147483648ull, 120091921ull,  // end of mad
+    203280620ull,  2147483648ull, 120485704ull,  203280620ull,
+    2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
+    120485704ull,  203315114ull,  2147483648ull, 120042599ull,
+    203315114ull,  2147483648ull, 120042599ull};
+
+// Right-aligned display names, indexed by shader number (0-4 use mad, 5-9 use
+// fma); run() embeds the name in the test description string.
+const char *shaderStr[] = {"        float_mad", " float_vector_mad",
+                           " float_unroll_mad", "       double_mad",
+                           "double_unroll_mad", "        float_fma",
+                           " float_vector_fma", " float_unroll_fma",
+                           "       double_fma", "double_unroll_fma"};
+
+// Registers one sub-test per (kernel variant, coordinate set) pair: ten kernel
+// variants times the number of entries in coords[].
+OCLPerfMandelbrot::OCLPerfMandelbrot() {
+  _numSubTests = numCoords * 10;
+}
+
+// Nothing to tear down here; the OpenCL objects are released in close().
+OCLPerfMandelbrot::~OCLPerfMandelbrot() {}
+
+// Fills the first width_ * width_ 32-bit words of `buffer` with `val`.
+//
+// The buffer is mapped for writing with a blocking map over bufSize_ bytes,
+// filled on the host and unmapped again.  A failed map or unmap is reported
+// through CHECK_RESULT instead of being silently ignored.
+//
+//   buffer - device buffer to initialise (at least bufSize_ bytes)
+//   val    - value written to every element
+void OCLPerfMandelbrot::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // clEnqueueMapBuffer returns NULL on failure; never write through it.
+  CHECK_RESULT((data == NULL) || (error_ != CL_SUCCESS),
+               "clEnqueueMapBuffer failed");
+
+  const unsigned int numElems = width_ * width_;
+  for (unsigned int i = 0; i < numElems; i++) {
+    data[i] = val;
+  }
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
+}
+
+// Adds the first width_ * width_ 32-bit words of `buffer` to totalIters.
+//
+// The buffer is mapped for reading with a blocking map over bufSize_ bytes,
+// summed on the host and unmapped again.  totalIters is accumulated, not
+// reset, here (open() zeroes it).  A failed map or unmap is reported through
+// CHECK_RESULT instead of dereferencing a NULL pointer.
+//
+//   buffer - device buffer holding per-pixel iteration counts
+void OCLPerfMandelbrot::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // clEnqueueMapBuffer returns NULL on failure; never read through it.
+  CHECK_RESULT((data == NULL) || (error_ != CL_SUCCESS),
+               "clEnqueueMapBuffer failed");
+
+  const unsigned int numElems = width_ * width_;
+  for (unsigned int i = 0; i < numElems; i++) {
+    totalIters += data[i];
+  }
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares one Mandelbrot sub-test.
+//
+// Steps: reset per-test state, pick the platform at _platformIndex and the
+// device at `deviceId`, create the context, command queue and output buffer,
+// select the kernel source for this sub-test, substitute MUL_ADD_INS with
+// "mad" or "fma", build the program, create the kernel, bind its arguments and
+// finally fill the output buffer with a 0xdeadbeef pattern.
+//
+//   test       - sub-test index; `test / numCoords` selects the kernel variant
+//                and `test % numCoords` selects the entry of coords[]
+//   units      - not used by this function
+//   conversion - out: always set to 1.0
+//   deviceId   - index of the device to use within the selected platform
+//
+// Any failure is reported through CHECK_RESULT, which returns early.  If the
+// sub-test needs doubles and the device exposes neither cl_amd_fp64 nor
+// cl_khr_fp64, `skip` is set, _perfInfo is zeroed and the function returns
+// before building anything.
+void OCLPerfMandelbrot::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  skip = false;
+  totalIters = 0;
+  isAMD = false;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+
+  // Maximum iteration count
+  // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible
+  // by 16.
+  // NOTE: Can increase to get better peak performance numbers, but be sure not
+  // to TDR slow ASICs!
+  unsigned int maxIter = 32768;
+
+  // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
+  // processes 4 pixels at once.
+  // NOTE: Can increase to get better peak performance numbers, but be sure not
+  // to TDR slow ASICs!
+  width_ = 256;
+
+  // We compute a square domain
+  bufSize_ = width_ * width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    const cl_int platformErr =
+        _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Guard the platforms[_platformIndex] accesses below.
+    const bool platformIndexOk = (_platformIndex < numPlatforms);
+    error_ = platformErr;
+
+    if ((platformErr == CL_SUCCESS) && platformIndexOk) {
+      // Zero-initialised so the vendor comparison below is well defined even
+      // if the query fails.
+      char pbuf[100] = {0};
+      error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                           CL_PLATFORM_VENDOR, sizeof(pbuf),
+                                           pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0,
+                                        NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so the status is deliberately not checked.
+      // Choose platform with GPU devices
+      if (num_devices > 0) {
+        if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+          isAMD = true;
+        }
+        platform = platforms[_platformIndex];
+      }
+    }
+
+    // Release the array before any early return (it came from new[]).
+    delete[] platforms;
+    CHECK_RESULT(platformErr != CL_SUCCESS, "clGetPlatformIDs failed");
+    CHECK_RESULT(!platformIndexOk, "Requested platform index not available");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  const bool deviceIdOk = (_deviceId < num_devices);
+  if ((error_ == CL_SUCCESS) && deviceIdOk) {
+    device = devices[_deviceId];
+  }
+  // Only the selected handle is needed from here on; free the list before the
+  // checks so no return path leaks it.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(!deviceIdOk, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Double precision is available through either the AMD or the KHR extension.
+  char *p = strstr(charbuf, "cl_amd_fp64");
+  char *p2 = strstr(charbuf, "cl_khr_fp64");
+  doubleSupport = (p != NULL) || (p2 != NULL);
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  const char *tmp;
+  shaderIdx = _openTest / numCoords;
+  if ((doubleSupport != true) && ((shaderIdx == 3) || (shaderIdx == 4) ||
+                                  (shaderIdx == 8) || (shaderIdx == 9))) {
+    // We don't support doubles, so skip those tests
+    skip = true;
+    _perfInfo = 0.0f;
+    return;
+  }
+
+  // Shaders 0-4 and 5-9 share the same five sources; only the multiply-add
+  // instruction differs.
+  if (shaderIdx == 0 || shaderIdx == 5) {
+    tmp = float_mandel;
+  } else if (shaderIdx == 1 || shaderIdx == 6) {
+    tmp = float_mandel_vec;
+  } else if (shaderIdx == 2 || shaderIdx == 7) {
+    tmp = float_mandel_unroll;
+  } else if (shaderIdx == 3 || shaderIdx == 8) {
+    tmp = double_mandel;
+  } else {
+    tmp = double_mandel_unroll;
+  }
+  std::string curr(tmp);
+  std::string searchString("MUL_ADD_INS");
+  std::string replaceString;
+  if (shaderIdx < 5) {
+    replaceString = "mad";
+  } else {
+    replaceString = "fma";
+  }
+
+  // Replace every occurrence of the placeholder with the chosen instruction.
+  std::string::size_type pos = 0;
+  while ((pos = curr.find(searchString, pos)) != std::string::npos) {
+    curr.replace(pos, searchString.size(), replaceString);
+    pos++;
+  }
+
+  tmp = curr.c_str();
+
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  // Tell the kernel source which fp64 pragma to enable, if any.
+  const char *buildOps = NULL;
+  if (p)
+    buildOps = "-DUSE_CL_AMD_FP64";
+  else if (p2)
+    buildOps = "-DUSE_CL_KHR_FP64";
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    // Keep the buffer printable even if the log query itself fails.
+    log[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // The condition must be true for CHECK_RESULT to report the failure.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "mandelbrot", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  // Top-left corner is (x - width/2, y + width/2); y decreases per row.
+  coordIdx = _openTest % numCoords;
+  if ((shaderIdx == 0) || (shaderIdx == 1) || (shaderIdx == 2) ||
+      (shaderIdx == 5) || (shaderIdx == 6) || (shaderIdx == 7)) {
+    float xStep = (float)(coords[coordIdx].width / (double)width_);
+    float yStep = (float)(-coords[coordIdx].width / (double)width_);
+    float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+    float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_uint), (void *)&width_);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_float), (void *)&xPos);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_float), (void *)&yPos);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_float), (void *)&xStep);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_float), (void *)&yStep);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&maxIter);
+  } else {
+    double xStep = coords[coordIdx].width / (double)width_;
+    double yStep = -coords[coordIdx].width / (double)width_;
+    double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
+    double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_uint), (void *)&width_);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_double), (void *)&xPos);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_double), (void *)&yPos);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_double), (void *)&xStep);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_double), (void *)&yStep);
+    error_ =
+        _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&maxIter);
+  }
+  // Poison the output so stale data cannot pass for a valid result.
+  setData(outBuffer_, 0xdeadbeef);
+}
+
+// Executes the kernel prepared by open(): one untimed warm-up launch, then
+// numLoops timed launches (each followed by clFinish).  Afterwards the output
+// buffer is summed into totalIters, the GFLOPS figure is stored in _perfInfo,
+// the description string is set, and the iteration total is validated against
+// the expected-value tables.  Does nothing when the sub-test was skipped.
+void OCLPerfMandelbrot::run(void) {
+  if (skip) {
+    return;
+  }
+
+  // One work-item per pixel, except for the float4 kernels which handle
+  // 4 pixels per thread.
+  int workItems = width_ * width_;
+  const bool fourPixelsPerItem = (shaderIdx == 1) || (shaderIdx == 6);
+  if (fourPixelsPerItem) {
+    workItems >>= 2;
+  }
+  int groupSize = 64;
+
+  size_t global_work_size[1] = {(size_t)workItems};
+  size_t local_work_size[1] = {(size_t)groupSize};
+
+  // Warm-up
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  // Accumulated wall-clock time of the timed launches, in timer units.
+  double totalTime = 0.0;
+
+  for (unsigned int pass = 0; pass < numLoops; ++pass) {
+    CPerfCounter timer;
+
+    timer.Reset();
+    timer.Start();
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+
+    timer.Stop();
+    totalTime += timer.GetElapsedTime();
+  }
+
+  checkData(outBuffer_);
+
+  // Compute GFLOPS.  There are 7 FLOPs per iteration
+  const double avgTime = totalTime / (double)numLoops;
+  const double gflops = ((double)totalIters * 7 * (double)(1e-09)) / avgTime;
+  _perfInfo = (float)gflops;
+
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), " %s (GFLOPS) ", shaderStr[shaderIdx]);
+  testDescString = desc;
+
+  // Dump iteration count
+  // printf(" totalIter = %lld\n", totalIters);
+  const bool amdGpu = isAMD && (type_ == CL_DEVICE_TYPE_GPU);
+  if (amdGpu) {
+    // On AMD GPUs a mad sub-test may also produce the fma result, so accept
+    // either the sub-test's own entry or its fma counterpart.
+    CHECK_RESULT((totalIters != expectedIters[_openTest]) &&
+                     (totalIters !=
+                      expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
+                                         ? _openTest + FMA_EXPECTEDVALUES_INDEX
+                                         : _openTest)]),
+                 "Incorrect iteration count detected!");
+  } else {
+    CHECK_RESULT(totalIters != expectedItersNV[_openTest],
+                 "Incorrect iteration count detected!");
+  }
+}
+
+// Releases every OpenCL object created by open() that is non-zero, in the
+// order buffer, kernel, program, queue, context.  A failing release is
+// reported via CHECK_RESULT_NO_RETURN and the remaining objects are still
+// released.  Returns _crcword.
+unsigned int OCLPerfMandelbrot::close(void) {
+  if (outBuffer_ != 0) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  if (kernel_ != 0) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+
+  if (program_ != 0) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+
+  if (cmd_queue_ != 0) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_ != 0) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+OCLPerfAsyncMandelbrot::OCLPerfAsyncMandelbrot() : cmd_queue2_(0), outBuffer2_(0) {}  // close() tests these
+
+OCLPerfAsyncMandelbrot::~OCLPerfAsyncMandelbrot() {}  // empty; CL objects are released in close()
+
+void OCLPerfAsyncMandelbrot::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cmd_queue2_ = 0;  // null the async handles so close() is safe on early exit
+  outBuffer2_ = 0;
+  // Create common items first (context_, device and bufSize_ are used below)
+  OCLPerfMandelbrot::open(test, units, conversion, deviceId);
+  // Create resources for async test: a second queue and a second output buffer
+  cmd_queue2_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue2_ == 0, "clCreateCommandQueue failed");
+  outBuffer2_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer2_ == 0, "clCreateBuffer(outBuffer2) failed");  // test the new buffer, not outBuffer_
+}
+
+// Times numLoops rounds of two kernel launches, one per command queue, and
+// stores GFLOPS in _perfInfo. Every call that sets error_ is checked.
+void OCLPerfAsyncMandelbrot::run(void) {
+  if (skip) return;
+  int global = width_ * width_;
+  // We handle 4 pixels per thread
+  if ((shaderIdx == 1) || (shaderIdx == 6)) global >>= 2;
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  // Warm-up: one untimed launch on each queue
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void *)&outBuffer2_);
+  CHECK_RESULT(error_, "clSetKernelArg failed");
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue2_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue2_);
+
+  double totalTime = 0.0;
+
+  for (unsigned int k = 0; k < numLoops; k++) {
+    CPerfCounter timer;
+
+    timer.Reset();
+    timer.Start();
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_);
+    CHECK_RESULT(error_, "clSetKernelArg failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                      (void *)&outBuffer2_);
+    CHECK_RESULT(error_, "clSetKernelArg failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue2_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFlush(cmd_queue_);
+    _wrapper->clFlush(cmd_queue2_);
+    _wrapper->clFinish(cmd_queue_);
+    _wrapper->clFinish(cmd_queue2_);
+
+    timer.Stop();
+    double sec = timer.GetElapsedTime();
+    totalTime += sec;
+  }
+
+  checkData(outBuffer_);
+  checkData(outBuffer2_);
+  // Compute GFLOPS.  There are 7 FLOPs per iteration
+  double perf = ((double)(totalIters * 7) * (double)(1e-09)) /
+                (totalTime / (double)numLoops);
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " async %s (GFLOPS) ", shaderStr[shaderIdx]);
+  testDescString = buf;
+  if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) {
+    CHECK_RESULT(
+        (totalIters != 2 * expectedIters[_openTest]) &&
+            (totalIters !=
+             2 * expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
+                                    ? _openTest + FMA_EXPECTEDVALUES_INDEX
+                                    : _openTest)]),
+        "Incorrect iteration count detected!");
+  } else {
+    CHECK_RESULT(totalIters != 2 * expectedItersNV[_openTest],
+                 "Incorrect iteration count detected!");
+  }
+}
+
+unsigned int OCLPerfAsyncMandelbrot::close(void) {
+  // Drain both queues (when they exist) before releasing the async objects.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (cmd_queue2_) _wrapper->clFinish(cmd_queue2_);
+  // Clean up async test items
+  if (outBuffer2_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer2_) failed");
+  }
+  if (cmd_queue2_) {  // guard the handle that is actually released
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  // Clean up the rest; the base close() supplies the return value
+  return OCLPerfMandelbrot::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h
new file mode 100644
index 0000000000..f810801038
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_Mandelbrot_H_
+#define _OCL_Mandelbrot_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfMandelbrot : public OCLTestImp {  // Mandelbrot kernel perf test
+ public:
+  OCLPerfMandelbrot();
+  virtual ~OCLPerfMandelbrot();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // releases the CL handles declared below
+
+  std::string shader_;
+  void setData(cl_mem buffer, unsigned int data);
+  void checkData(cl_mem buffer);
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  cl_int error_;  // status of the most recent OpenCL call that returned one
+  cl_device_id device;
+
+  unsigned int width_;  // async run() launches width_ * width_ work-items
+  unsigned int bufSize_;  // byte size passed to clCreateBuffer
+  bool doubleSupport;
+  bool skip;  // OCLPerfAsyncMandelbrot::run() returns immediately when set
+  unsigned int maxIter;
+  unsigned int shaderIdx;  // indexes shaderStr; 1 and 6 quarter the work size
+  unsigned int coordIdx;
+  unsigned long long totalIters;  // compared with expectedIters* after a run
+  bool isAMD;  // with a GPU type_, selects expectedIters over expectedItersNV
+  static const unsigned int numLoops = 10;  // number of timed loop rounds
+};
+
+class OCLPerfAsyncMandelbrot : public OCLPerfMandelbrot {  // two-queue variant
+ public:
+  OCLPerfAsyncMandelbrot();
+  virtual ~OCLPerfAsyncMandelbrot();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_command_queue cmd_queue2_;  // second queue, created in open()
+  cl_mem outBuffer2_;            // kernel output when launching on cmd_queue2_
+};
+
+#endif  // _OCL_Mandelbrot_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp
new file mode 100644
index 0000000000..74618f5b46
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp
@@ -0,0 +1,262 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMapBufferReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Buffer sizes in bytes: 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+// Map/unmap repetitions per timed run; open() indexes this by subtest group.
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfMapBufferReadSpeed::NUM_ITER};
+#define NUM_OFFSETS 1
+static const unsigned int offsets[NUM_OFFSETS] = {0};  // added to the aligned USE_HOST_PTR address
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)
+OCLPerfMapBufferReadSpeed::OCLPerfMapBufferReadSpeed() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2;  // x2: one group per Iterations[] entry
+}
+
+OCLPerfMapBufferReadSpeed::~OCLPerfMapBufferReadSpeed() {}  // empty; resources are released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op callback handed to clCreateContext in open()
+
+// Sets up one subtest. The subtest index `test` is decoded as:
+//   test % NUM_SIZES                   -> buffer size from Sizes[]
+//   (test / NUM_SIZES) % NUM_SUBTESTS  -> 0 default, 1 ALLOC_HOST_PTR,
+//                                         2 persistent (AMD only), 3+ USE_HOST_PTR
+//   test / (NUM_SIZES * NUM_SUBTESTS)  -> iteration count from Iterations[]
+// `units` is not written; `conversion` is set to 1. Failures are reported
+// through CHECK_RESULT; the temporary platform and device arrays are freed
+// before any check that follows them. close() releases what is created here.
+void OCLPerfMapBufferReadSpeed::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy out the one handle we need and free the array right away, so the
+    // checks below cannot leak it. An out-of-range index leaves platform NULL.
+    if (error_ == CL_SUCCESS && (cl_uint)_platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  // Zero-filled so the strcmp below is safe even if the query fails.
+  char pbuf[100] = "";
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+  /* Get the number of requested devices */
+  num_devices = 0;
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so error_ is deliberately not checked here.
+  if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD = true;
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) {
+    useHostPtr = true;
+    offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3];
+  } else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) {
+    persistent = true;
+  } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) {
+    allocHostPtr = true;
+  }
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+
+  // Validate the index before allocating so a bad deviceId cannot leak.
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) device = devices[_deviceId];
+  free(devices);  // only the selected handle is needed from here on
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU, if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter blocking map(read)/unmap round trips of outBuffer_ and stores
+// either microseconds per map or GB/s in _perfInfo (see the branch below).
+void OCLPerfMapBufferReadSpeed::run(void) {
+  CPerfCounter timer;
+  void *mem;
+  // Warm up
+  mem =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE,
+                                       CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+                                       &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Map read bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  if (persistent || allocHostPtr) {
+    _perfInfo = (float)(sec / numIter) * 1000000.0f;  // Get us per map
+  } else {
+    _perfInfo = (float)perf;
+  }
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (us)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (us)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char buf[256];  // bufSize_ and numIter are unsigned, hence %u below
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i: %4u %29s ", bufSize_, numIter,
+           str);
+  testDescString = buf;
+}
+
+// Releases the buffer, queue and context from open(), then the host allocation.
+unsigned int OCLPerfMapBufferReadSpeed::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // free(NULL) is a no-op; clear both pointers so neither dangles afterwards.
+  free(hostMem);
+  hostMem = alignedMem = NULL;
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h
new file mode 100644
index 0000000000..4017061d79
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MapBufferReadSpeed_H_
+#define _OCL_MapBufferReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfMapBufferReadSpeed : public OCLTestImp {  // times map(read)/unmap
+ public:
+  OCLPerfMapBufferReadSpeed();
+  virtual ~OCLPerfMapBufferReadSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;  // second entry of Iterations[]
+
+  cl_context context_;          // created in open(), released in close()
+  cl_command_queue cmd_queue_;  // created in open(), released in close()
+  cl_mem outBuffer_;            // buffer that run() maps and unmaps
+  cl_int error_;                // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // buffer size in bytes, taken from Sizes[]
+  bool persistent;        // adds CL_MEM_USE_PERSISTENT_MEM_AMD
+  bool allocHostPtr;      // adds CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;        // adds CL_MEM_USE_HOST_PTR, backed by hostMem
+  unsigned int numIter;   // timed map/unmap repetitions
+  char* hostMem;          // raw malloc block; freed in close()
+  char* alignedMem;       // aligned address inside hostMem, plus offset
+  size_t alignment;       // set to 4096 in open()
+  unsigned int offset;    // taken from offsets[] for USE_HOST_PTR subtests
+  bool isAMD;             // vendor string matched "Advanced Micro Devices, Inc."
+};
+
+#endif  // _OCL_MapBufferReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp
new file mode 100644
index 0000000000..dd12ded6d4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp
@@ -0,0 +1,291 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMapBufferWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Buffer sizes in bytes: 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+// Map/unmap repetitions per timed run; open() clamps the group index to 1.
+static const unsigned int Iterations[2] = {
+    1, OCLPerfMapBufferWriteSpeed::NUM_ITER};
+#define NUM_OFFSETS 1
+static const unsigned int offsets[NUM_OFFSETS] = {0};  // added to the aligned USE_HOST_PTR address
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)
+OCLPerfMapBufferWriteSpeed::OCLPerfMapBufferWriteSpeed() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * 3;  // x3: 1 iter, NUM_ITER, NUM_ITER + invalidate-region
+}
+
+OCLPerfMapBufferWriteSpeed::~OCLPerfMapBufferWriteSpeed() {}  // empty; resources are released in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op callback handed to clCreateContext in open()
+
+// Sets up one subtest. The subtest index `test` is decoded as:
+//   test % NUM_SIZES                   -> buffer size from Sizes[]
+//   (test / NUM_SIZES) % NUM_SUBTESTS  -> 0 default, 1 ALLOC_HOST_PTR,
+//                                         2 persistent (AMD only), 3+ USE_HOST_PTR
+//   test / (NUM_SIZES * NUM_SUBTESTS)  -> 0: 1 iteration, 1: NUM_ITER,
+//                                         2: NUM_ITER with INVALIDATE_REGION
+// Failures are reported through CHECK_RESULT; the temporary platform and device
+// arrays are freed before any check that follows them.
+void OCLPerfMapBufferWriteSpeed::open(unsigned int test, char *units,
+                                      double &conversion,
+                                      unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy out the one handle we need and free the array right away, so the
+    // checks below cannot leak it. An out-of-range index leaves platform NULL.
+    if (error_ == CL_SUCCESS && (cl_uint)_platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  // Zero-filled so the strcmp below is safe even if the query fails.
+  char pbuf[100] = "";
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+  /* Get the number of requested devices */
+  num_devices = 0;
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so error_ is deliberately not checked here.
+  if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD = true;
+  }
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  platformVersion[0] = getVersion[7];  // "OpenCL M.m ..." -> keep "M.m"
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) {
+    useHostPtr = true;
+    offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3];
+  } else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) {
+    persistent = true;
+  } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) {
+    allocHostPtr = true;
+  }
+
+  numIter = Iterations[std::min(_openTest / (NUM_SIZES * NUM_SUBTESTS), 1u)];
+
+  if (_openTest < NUM_SIZES * NUM_SUBTESTS * 2) {
+    mapFlags = CL_MAP_WRITE;
+  } else {
+    mapFlags = CL_MAP_WRITE_INVALIDATE_REGION;
+  }
+  // Validate the index before allocating so a bad deviceId cannot leak.
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) device = devices[_deviceId];
+  free(devices);  // only the selected handle is needed from here on
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, outBuffer_, memBuffer, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter blocking map(mapFlags)/unmap round trips of outBuffer_ and
+// stores either microseconds per map or GB/s in _perfInfo (see branch below).
+void OCLPerfMapBufferWriteSpeed::run(void) {
+  CPerfCounter timer;
+  if (_openTest >= NUM_SIZES * NUM_SUBTESTS * 2) {
+    // Skip CL_MAP_WRITE_INVALIDATE_REGION testing for 1.0 and 1.1 platforms
+    if ((platformVersion[0] == '1') &&
+        ((platformVersion[2] == '0') || (platformVersion[2] == '1'))) {
+      char buf[256];
+      SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+      testDescString = buf;
+      return;
+    }
+  }
+  void *mem;
+  // Warm up
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    mem =
+        _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Map write bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  if (persistent || allocHostPtr) {
+    _perfInfo = (float)(sec / numIter) * 1000000.0f;  // Get us per map
+  } else {
+    _perfInfo = (float)perf;
+  }
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (us)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (us)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char str2[256];
+  if (mapFlags == CL_MAP_WRITE_INVALIDATE_REGION) {
+    SNPRINTF(str2, sizeof(str2), "INV_REG %29s", str);
+  } else {
+    SNPRINTF(str2, sizeof(str2), "%29s", str);
+  }
+  char buf[256];  // bufSize_ and numIter are unsigned, hence %u below
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i: %4u %37s ", bufSize_, numIter,
+           str2);
+  testDescString = buf;
+}
+
+// Releases the buffer, queue and context from open(), then the host allocation.
+unsigned int OCLPerfMapBufferWriteSpeed::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // free(NULL) is a no-op; clear both pointers so neither dangles afterwards.
+  free(hostMem);
+  hostMem = alignedMem = NULL;
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h
new file mode 100644
index 0000000000..ebcf8dc7d4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MapBufferWriteSpeed_H_
+#define _OCL_MapBufferWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Buffer map-for-write perf test (see run() for the reported metric).
+class OCLPerfMapBufferWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfMapBufferWriteSpeed();
+  virtual ~OCLPerfMapBufferWriteSpeed();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // released in close()
+  cl_int error_;      // status of the most recent CL call
+
+  unsigned int bufSize_;  // buffer size in bytes
+  bool persistent;        // when set, run() reports us per map
+  bool allocHostPtr;      // when set, run() reports us per map
+  bool useHostPtr;        // when set (and the two above are not), GB/s + offset
+  unsigned int numIter;   // iteration count used by run()
+  char* hostMem;          // freed in close()
+  char* alignedMem;
+  size_t alignment;
+  unsigned int offset;    // printed as "off:" in the USE_HOST_PTR description
+  bool isAMD;
+  cl_map_flags mapFlags;  // CL_MAP_WRITE_INVALIDATE_REGION adds "INV_REG"
+  char platformVersion[32];
+};
+
+#endif  // _OCL_MapBufferWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp
new file mode 100644
index 0000000000..16572c5156
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp
@@ -0,0 +1,213 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMapImageReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+// Sizes: square image edge lengths. The format tables are indexed by bufnum_.
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};
+// Iteration counts: sub-tests run first with 1 iteration, then with NUM_ITER.
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfMapImageReadSpeed::NUM_ITER};
+
+OCLPerfMapImageReadSpeed::OCLPerfMapImageReadSpeed() {
+  _numSubTests = 2 * NUM_SIZES * NUM_FORMATS;  // 2 == entries in Iterations
+}
+
+OCLPerfMapImageReadSpeed::~OCLPerfMapImageReadSpeed() {}  // see close()
+
+// Notification callback handed to clCreateContext(); the report is ignored.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+    const void *private_info, size_t cb, void *user_data) {}
+
+// Picks the platform/device for this sub-test and creates the context,
+// command queue and 2D image that run() maps; close() releases them.
+//   test       - sub-test index; selects the image size and iteration count.
+//   units      - unused here.
+//   conversion - set to 1.0.
+//   deviceId   - index into the type_ devices of the selected platform.
+// NOTE(review): assumes CHECK_RESULT returns from open() on failure.
+void OCLPerfMapImageReadSpeed::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and release the array before any check can return
+    // early. An out-of-range _platformIndex leaves platform NULL, which is
+    // reported by the platform check below instead of reading past the array.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[], so delete[] is required
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    if (platform != NULL) {
+      /* Get the number of requested devices */
+      // The result is deliberately not checked: the runtime returns an error
+      // when no matching devices are present instead of returning 0 devices.
+      error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                        &num_devices);
+    }
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Only the selected handle is needed afterwards, so copy it and free the
+  // list before the checks below can return early and leak it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+}
+
+// Measures blocking map-for-read throughput on the image created in open().
+// After one untimed warm-up, numIter map/unmap/finish round trips over the
+// whole image are timed; _perfInfo receives GB/s and testDescString a label.
+void OCLPerfMapImageReadSpeed::run(void) {
+  CPerfCounter timer;
+  size_t rowPitch;
+  size_t slicePitch;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+
+  // Warm up
+  void *mapped = _wrapper->clEnqueueMapImage(
+      cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+      &rowPitch, &slicePitch, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapImage failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mapped, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed region: keep it limited to the map/unmap/finish calls.
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass != numIter; ++pass) {
+    mapped = _wrapper->clEnqueueMapImage(
+        cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+        &rowPitch, &slicePitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mapped,
+                                               0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+  timer.Stop();
+
+  // Image map read bandwidth in GB/s: bytes mapped over all timed iterations,
+  // scaled by 1e-9, divided by the elapsed time.
+  const double elapsed = timer.GetElapsedTime();
+  const double mappedBytes =
+      (double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter;
+  _perfInfo = (float)((mappedBytes * (double)(1e-09)) / elapsed);
+
+  // Human-readable description of this sub-test.
+  char label[256];
+  SNPRINTF(label, sizeof(label), " (%4dx%4d) fmt:%s i: %4d (GB/s) ", bufSize_,
+           bufSize_, textFormats[bufnum_], numIter);
+  testDescString = label;
+}
+
+unsigned int OCLPerfMapImageReadSpeed::close(void) {  // undoes open()
+  if (outBuffer_) {  // image created in open()
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // Handles still at 0 (open() zeroes them before creating) are skipped above.
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h
new file mode 100644
index 0000000000..509075fc41
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MapImageReadSpeed_H_
+#define _OCL_MapImageReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Perf test timing clEnqueueMapImage(CL_MAP_READ) on a square 2D image.
+class OCLPerfMapImageReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfMapImageReadSpeed();
+  virtual ~OCLPerfMapImageReadSpeed();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 100;  // iterations in the long runs
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // the 2D image being mapped
+  cl_int error_;      // status of the most recent CL call
+
+  unsigned int bufSize_;  // image width and height
+  unsigned int bufnum_;   // index into the format tables
+  unsigned int numIter;   // timed iterations for the current sub-test
+};
+
+#endif  // _OCL_MapImageReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp
new file mode 100644
index 0000000000..9c0a4bd87e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp
@@ -0,0 +1,214 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMapImageWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+// Sizes: square image edge lengths. The format tables are indexed by bufnum_.
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+static const unsigned int formatSize[NUM_FORMATS] = {4};
+// Iteration counts: sub-tests run first with 1 iteration, then with NUM_ITER.
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfMapImageWriteSpeed::NUM_ITER};
+
+OCLPerfMapImageWriteSpeed::OCLPerfMapImageWriteSpeed() {
+  _numSubTests = 2 * NUM_SIZES * NUM_FORMATS;  // 2 == entries in Iterations
+}
+
+OCLPerfMapImageWriteSpeed::~OCLPerfMapImageWriteSpeed() {}  // see close()
+
+// Notification callback handed to clCreateContext(); the report is ignored.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+    const void *private_info, size_t cb, void *user_data) {}
+
+// Picks the platform/device for this sub-test and creates the context,
+// command queue and 2D image that run() maps; close() releases them.
+//   test       - sub-test index; selects the image size and iteration count.
+//   units      - unused here.
+//   conversion - set to 1.0.
+//   deviceId   - index into the type_ devices of the selected platform.
+// NOTE(review): assumes CHECK_RESULT returns from open() on failure.
+void OCLPerfMapImageWriteSpeed::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and release the array before any check can return
+    // early. An out-of-range _platformIndex leaves platform NULL, which is
+    // reported by the platform check below instead of reading past the array.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[], so delete[] is required
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    if (platform != NULL) {
+      /* Get the number of requested devices */
+      // The result is deliberately not checked: the runtime returns an error
+      // when no matching devices are present instead of returning 0 devices.
+      error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                        &num_devices);
+    }
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  // Only the selected handle is needed afterwards, so copy it and free the
+  // list before the checks below can return early and leak it.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+}
+
+// Measures blocking map-for-write throughput on the image created in open().
+// After one untimed warm-up, numIter map/unmap/finish round trips over the
+// whole image are timed; _perfInfo receives GB/s and testDescString a label.
+void OCLPerfMapImageWriteSpeed::run(void) {
+  CPerfCounter timer;
+  size_t rowPitch;
+  size_t slicePitch;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+
+  // Warm up
+  void *mapped = _wrapper->clEnqueueMapImage(
+      cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+      &rowPitch, &slicePitch, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapImage failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mapped, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Timed region: keep it limited to the map/unmap/finish calls.
+  timer.Reset();
+  timer.Start();
+  for (unsigned int pass = 0; pass != numIter; ++pass) {
+    mapped = _wrapper->clEnqueueMapImage(
+        cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &rowPitch, &slicePitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mapped,
+                                               0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+  timer.Stop();
+
+  // Image map write bandwidth in GB/s: bytes mapped over all timed iterations,
+  // scaled by 1e-9, divided by the elapsed time.
+  const double elapsed = timer.GetElapsedTime();
+  const double mappedBytes =
+      (double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter;
+  _perfInfo = (float)((mappedBytes * (double)(1e-09)) / elapsed);
+
+  // Human-readable description of this sub-test.
+  char label[256];
+  SNPRINTF(label, sizeof(label), " (%4dx%4d) fmt:%s i: %4d (GB/s) ", bufSize_,
+           bufSize_, textFormats[bufnum_], numIter);
+  testDescString = label;
+}
+
+unsigned int OCLPerfMapImageWriteSpeed::close(void) {  // undoes open()
+  if (outBuffer_) {  // image created in open()
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // Handles still at 0 (open() zeroes them before creating) are skipped above.
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h
new file mode 100644
index 0000000000..0e05b4a3a2
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MapImageWriteSpeed_H_
+#define _OCL_MapImageWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Perf test timing clEnqueueMapImage(CL_MAP_WRITE) on a square 2D image.
+class OCLPerfMapImageWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfMapImageWriteSpeed();
+  virtual ~OCLPerfMapImageWriteSpeed();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 100;  // iterations in the long runs
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // the 2D image being mapped
+  cl_int error_;      // status of the most recent CL call
+
+  unsigned int bufSize_;  // image width and height
+  unsigned int bufnum_;   // index into the format tables
+  unsigned int numIter;   // timed iterations for the current sub-test
+};
+
+#endif  // _OCL_MapImageWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp
new file mode 100644
index 0000000000..7c8bae1d13
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp
@@ -0,0 +1,326 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMatrixTranspose.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_BLOCK_SIZES = 2;
+static const unsigned int blockSizes[NUM_BLOCK_SIZES] = {8, 16};  // blockSize_
+static const unsigned int NUM_MATRIX_DIMS = 2;
+static const unsigned int matrixDims[NUM_MATRIX_DIMS] = {1024, 1920};  // width_
+// Kernel: stage one element per work-item in local memory with x/y swapped,
+static const char *matrixtranspose_kernel =
+    "kernel void matrixTranspose(global uint *restrict inBuf, global uint "
+    "*restrict outBuf, local uint *localBuf, uint blockSize, uint width, uint "
+    "height)\n"
+    "{\n"
+    "    uint globalIdx = get_global_id(0);\n"
+    "    uint globalIdy = get_global_id(1);\n"
+    "    uint localIdx = get_local_id(0);\n"
+    "    uint localIdy = get_local_id(1);\n"
+
+    "    /* copy from input to local memory */\n"
+    "    /* Note that we transpose the x and y coordinates when storing */\n"
+    "    localBuf[localIdx*blockSize + localIdy] = inBuf[globalIdy*width + "
+    "globalIdx];\n"
+
+    "    /* wait until the whole block is filled */\n"
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+    "    uint groupIdx = get_group_id(0);\n"
+    "    uint groupIdy = get_group_id(1);\n"
+
+    "    /* calculate the corresponding target location for transpose  by "
+    "inverting x and y values*/\n"
+    "    /* Here we don't swap localIdx and localIdy, this is to get larger "
+    "bursts when threads write to memory. */\n"
+    "    /* To make this work, we've swapped the coordinates when we write to "
+    "local memory. */\n"
+    "    uint targetGlobalIdx = groupIdy*blockSize + localIdx;\n"
+    "    uint targetGlobalIdy = groupIdx*blockSize + localIdy;\n"
+
+    "    /* calculate the corresponding raster indices of source and target "
+    "*/\n"
+    "    uint targetIndex  = targetGlobalIdy*height     + targetGlobalIdx;\n"
+    "    uint sourceIndex  = localIdy       * blockSize + localIdx;\n"
+
+    "    outBuf[targetIndex] = localBuf[sourceIndex];\n"
+    "}\n";
+
+OCLPerfMatrixTranspose::OCLPerfMatrixTranspose() {
+  _numSubTests = NUM_MATRIX_DIMS * NUM_BLOCK_SIZES;  // every dim x block size
+}
+
+OCLPerfMatrixTranspose::~OCLPerfMatrixTranspose() {}  // empty body
+
+// Writes each element's own linear index (0 .. width_*height_-1) into buffer.
+void OCLPerfMatrixTranspose::setData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // no NULL writes
+  for (unsigned int idx = 0; idx < width_ * height_; idx++) {
+    data[idx] = idx;  // same as data[i * width_ + j] = i * width_ + j
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Sets all width_ * height_ elements of buffer to val.
+void OCLPerfMatrixTranspose::fillData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // no NULL writes
+  for (unsigned int i = 0; i < width_ * height_; i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Compares buffer with the transposed setData() pattern; prints 1st mismatch.
+void OCLPerfMatrixTranspose::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  bool err = false;  // set at the first mismatch; ends both loops
+  for (unsigned int i = 0; (i < width_) && !err; i++) {
+    for (unsigned int j = 0; (j < height_) && !err; j++) {
+      if (*(data + i * height_ + j) != (j * width_ + i)) {
+        printf("Data mismatch at (%u, %u)!  Got %u, expected %u\n", j, i,
+               *(data + i * height_ + j), j * width_ + i);
+        err = true;
+      }
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Notification callback handed to clCreateContext(); the report is ignored.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+    const void *private_info, size_t cb, void *user_data) {}
+
+void OCLPerfMatrixTranspose::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+
+  blockSize_ = blockSizes[_openTest % NUM_BLOCK_SIZES];
+  width_ = matrixDims[_openTest / NUM_BLOCK_SIZES];
+  height_ = width_;
+  // We compute a square domain
+  bufSize_ = width_ * height_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+#if 0
+        // Get last for default
+        platform = platforms[numPlatforms-1];
+        for (unsigned i = 0; i < numPlatforms; ++i) {
+#endif
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose platform with GPU devices
+    if (num_devices > 0) {
+      // platform = platforms[_platformIndex];
+      // break;
+    }
+#if 0
+        }
+#endif
+    delete platforms;
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  device = devices[_deviceId];
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_,
+                                       NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+  setData(inBuffer_);
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_,
+                                        NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+  fillData(outBuffer_, 0xdeadbeef);
+
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&matrixtranspose_kernel, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  char *buildOps = NULL;
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    cl_int intError;
+    char log[16384];
+    intError =
+        _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                        16384 * sizeof(char), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    CHECK_RESULT(0, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "matrixTranspose", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(
+      kernel_, 2, sizeof(cl_uint) * blockSize_ * blockSize_, NULL);
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint),
+                                    (void *)&blockSize_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&width_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_uint), (void *)&height_);
+}
+
+// Times MAX_ITERATIONS launches of the transpose kernel, validates the output
+// buffer and publishes the bandwidth (GB/s) plus a per-launch time in ms.
+void OCLPerfMatrixTranspose::run(void) {
+  size_t global_work_size[2] = {width_, height_};
+  size_t local_work_size[2] = {blockSize_, blockSize_};
+  CPerfCounter timer;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < MAX_ITERATIONS; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 2, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    // Check every launch; a later success must not hide an earlier failure.
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  checkData(outBuffer_);
+  // Compute GB/s
+  double perf =
+      ((double)bufSize_ * (double)MAX_ITERATIONS * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+
+  // Bounded write (the old 64-byte sprintf could overflow); fields are unsigned.
+  char str[128];
+  snprintf(str, sizeof(str),
+           "(%u,%u) matrix with (%2u,%2u) block size %fms (GB/s) ", width_,
+           height_, blockSize_, blockSize_,
+           (sec / (double)MAX_ITERATIONS) * 1000.);
+  testDescString = str;
+}
+
+unsigned int OCLPerfMatrixTranspose::close(void) {  // tear-down counterpart of open()
+  _wrapper->clFinish(cmd_queue_);
+  // Release each non-null handle; per its name the *_NO_RETURN check does not bail out.
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {  // the queue goes after the objects that were used on it
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {  // the context is released last
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // _crcword is not declared in this class's header, so it is inherited state.
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h
new file mode 100644
index 0000000000..ac5c875162
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MATRIX_TRANSPOSE_H_
+#define _OCL_MATRIX_TRANSPOSE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfMatrixTranspose : public OCLTestImp {  // transpose-kernel perf test
+ public:
+  OCLPerfMatrixTranspose();
+  virtual ~OCLPerfMatrixTranspose();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // creates the CL objects below
+  virtual void run(void);            // timed launches, then checkData(outBuffer_)
+  virtual unsigned int close(void);  // releases the CL objects, returns _crcword
+
+  std::string shader_;
+  void setData(cl_mem buffer);  // open() calls this on inBuffer_
+  void fillData(cl_mem buffer, unsigned int data);  // open(): outBuffer_, 0xdeadbeef
+  void checkData(cl_mem buffer);  // run() calls this on outBuffer_
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem inBuffer_;   // kernel argument 0
+  cl_mem outBuffer_;  // kernel argument 1
+  cl_int error_;      // status of the most recent CL call
+
+  unsigned int width_;      // global work size, dimension 0
+  unsigned int height_;     // global work size, dimension 1
+  unsigned int bufSize_;    // byte size used for both buffers
+  unsigned int blockSize_;  // local work size in both dimensions
+  static const unsigned int MAX_ITERATIONS = 50;  // launches timed by run()
+};
+
+#endif  // _OCL_MATRIX_TRANSPOSE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp
new file mode 100644
index 0000000000..057b9e3d25
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp
@@ -0,0 +1,234 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMemCombine.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+struct TestParams {  // one sub-test configuration
+  const char* type;         // OpenCL element type name spliced into the kernel
+  unsigned int numCombine;  // consecutive copies emitted per loop iteration
+  unsigned int assignSize;  // bytes per element (looks like sizeof(type)); scales dataRange_
+};
+
+TestParams testParams[]
+    // The "char" entry ({"char", 16, 1}) is left out on purpose: that type
+    // crashed the shader compiler. Re-enable it once the compiler is fixed.
+    // Each row is {type, numCombine, assignSize}; the row index is the
+    // sub-test number handed to open().
+    = {{"short", 8, 2},  {"int", 4, 4},      {"long", 4, 8},   {"float", 4, 4},
+       {"char4", 4, 4},  {"uchar16", 4, 16}, {"short2", 4, 4}, {"int2", 4, 8},
+       {"uint4", 4, 16}, {"long2", 4, 16},   {"float2", 4, 8}};
+
+const int numTests = sizeof(testParams) / sizeof(TestParams);  // number of table rows
+
+// Writes into `ret` the OpenCL source of combine_vload_vstores: every loop
+// iteration copies numCombine consecutive `type` elements from src to result,
+// the adjacent accesses MemCombine is expected to merge. Nothing bounds the
+// output, so `ret` must be large enough for the whole program text.
+void genCombineVLoadVStores(const char* type, int loopSize, int numCombine,
+                            char* ret) {
+  // `out` always points at the terminating NUL of the text emitted so far.
+  char* out = ret;
+  out += sprintf(out,
+                 "__kernel void combine_vload_vstores(__global %s"
+                 " * restrict src, __global %s *result) {\n",
+                 type, type);
+  out += sprintf(out, "  int id = get_global_id(0);\n");
+  out += sprintf(out, "  int gsize = get_global_size(0);\n");
+  out += sprintf(out, "  for (int i = 0; i < %d; i+=gsize) {\n", loopSize);
+  out += sprintf(out, "    int j = (i+id) * %d;\n", numCombine);
+  for (int k = 0; k < numCombine; ++k) {
+    out += sprintf(out, "    result[j+%d] = src[j+%d];\n", k, k);
+  }
+  sprintf(out, "  }\n}\n");
+}
+
+void OCLPerfMemCombine::setData(cl_mem buffer, unsigned int bufSize,
+                                unsigned char val) {  // fill `buffer` with val
+  unsigned char* data = (unsigned char*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues_[0], buffer, true, CL_MAP_WRITE, 0, bufSize, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // no NULL write
+  memset(data, val, bufSize);  // blocking map: the pointer is usable at once
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[0], buffer, data, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmdQueues_[0]);
+}
+
+// Dumps bufSize bytes as decimal values, starting a new row every 32 bytes.
+void print1Darray(unsigned char* buffer, unsigned int bufSize) {
+  for (unsigned int idx = 0; idx != bufSize; ++idx) {
+    printf((idx % 32 == 0) ? "\n%d " : "%d ", buffer[idx]);
+  }
+  printf("\n");
+}
+
+// Verifies that the first `limit` bytes of `buffer` hold 1 and the remaining
+// bytes hold defVal; dumps the bytes at the first mismatch and flags it.
+void OCLPerfMemCombine::checkData(cl_mem buffer, unsigned int bufSize,
+                                  unsigned int limit, unsigned char defVal) {
+  unsigned char* data = (unsigned char*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues_[0], buffer, true, CL_MAP_READ, 0, bufSize, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < bufSize; i++) {
+    const unsigned char expected = (i < limit) ? 1U : defVal;
+    if (data[i] != expected) {
+      printf("at index %d:\n", i);
+      // Never dump past the end of the mapping.
+      const unsigned int left = bufSize - i;
+      print1Darray(&data[i], left < 16 ? left : 16);
+      // Report without returning so the buffer is still unmapped below.
+      CHECK_RESULT_NO_RETURN(1, "incorrect output data detected!");
+      break;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[0], buffer, data, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmdQueues_[0]);
+}
+
+OCLPerfMemCombine::OCLPerfMemCombine() { _numSubTests = numTests; }  // one sub-test per testParams row
+
+OCLPerfMemCombine::~OCLPerfMemCombine() {}  // empty body: no cleanup is done here
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced in OCLPerfMemCombine.cpp
+
+void OCLPerfMemCombine::open(unsigned int test, char* units, double& conversion,
+                             unsigned int deviceId) {
+  _openTest = test;
+  // Clear the handles before handing control to the base-class open.
+  context_ = 0;
+  kernel_ = NULL;
+  program_ = NULL;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  // NOTE(review): context_ is used right below, so the base open presumably sets it - confirm.
+  cl_mem inBuffer =
+      _wrapper->clCreateBuffer(context_, 0, inSize_, NULL, &error_);
+  CHECK_RESULT(inBuffer == 0, "clCreateBuffer(inBuffer) failed");
+  buffers_.push_back(inBuffer);
+
+  cl_mem outBuffer =
+      _wrapper->clCreateBuffer(context_, 0, outSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer == 0, "clCreateBuffer(outBuffer) failed");
+  buffers_.push_back(outBuffer);
+  // Build the kernel for this sub-test, then seed input with 1s and output with 0s.
+  createKernel(testParams[test].type, testParams[test].numCombine);
+  setData(inBuffer, inSize_, 1U);
+  setData(outBuffer, outSize_, 0);
+  dataRange_ = loopSize_ * numCombine_ * testParams[test].assignSize;  // bytes run() expects rewritten
+}
+
+void OCLPerfMemCombine::createKernel(const char* type, int numCombine) {
+  dataType_ = type;
+  numCombine_ = numCombine;
+
+  // Generate the kernel source for `type`/`numCombine`, build it for the
+  // selected device, create the kernel object and bind buffers()[0] (input)
+  // and buffers()[1] (output) as its two arguments.
+  char source[1024];
+  genCombineVLoadVStores(type, loopSize_, numCombine, source);
+  size_t sourceSize[] = {strlen(source)};
+  const char* src = &source[0];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &src, sourceSize,
+                                                 &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clCreateProgramWithSource failed");
+
+  /* create a cl program executable for the selected device */
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-initialised so a failed log query still prints a valid string.
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                    NULL);
+    printf("Build error -> %s\n", log);
+    CHECK_RESULT(1, "clBuildProgram failed");  // flag it; was a silent return
+  }
+
+  /* get a kernel object handle for a kernel with the given name */
+  const char* kernelName = "combine_vload_vstores";
+  kernel_ = _wrapper->clCreateKernel(program_, kernelName, &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clCreateKernel failed");
+
+  /*** Set appropriate arguments to the kernel ***/
+  /* the input array to the kernel */
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void*)&buffers()[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+
+  /* the output array to the kernel */
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem),
+                                    (void*)&buffers()[1]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+}
+
+void OCLPerfMemCombine::run(void) {  // timed launches, then output validation
+  size_t globalThreads[1];
+  size_t localThreads[1];
+  // Global size equals local size, i.e. a single work-group of 64 work-items.
+  globalThreads[0] = 64;
+  localThreads[0] = 64;
+
+  CPerfCounter timer;
+  timer.Reset();
+  timer.Start();
+  // The timed region covers NUM_ITER launches plus the closing clFinish.
+  for (unsigned int i = 0; i < NUM_ITER; ++i) {
+    /*
+     * Enqueue one kernel launch; its status is checked right away.
+     */
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[0], kernel_, 1, NULL,
+                                              globalThreads, localThreads, 0,
+                                              NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmdQueues_[0]);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "%d %-8s (sec)", numCombine_, dataType_);
+  testDescString = buf;
+  _perfInfo = (float)sec;  // total seconds for all NUM_ITER launches
+  // The first dataRange_ output bytes must be 1, every later byte still 0.
+  checkData(buffers()[1], outSize_, dataRange_, 0);
+  return;
+}
+
+unsigned int OCLPerfMemCombine::close(void) { return OCLTestImp::close(); }  // forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h
new file mode 100644
index 0000000000..6c7225ceca
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MemCombine_H_
+#define _OCL_MemCombine_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfMemCombine : public OCLTestImp {  // load/store combining perf test
+  enum { inSize_ = 4096U * 1024U };   // input buffer size in bytes (4 MiB)
+  enum { outSize_ = 4096U * 1024U };  // output buffer size in bytes (4 MiB)
+  enum { loopSize_ = 8192 };          // loop bound baked into the generated kernel
+
+ public:
+  OCLPerfMemCombine();
+  virtual ~OCLPerfMemCombine();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;  // kernel launches timed by run()
+
+  const char* dataType_;     // element type name of the current sub-test
+  unsigned int numCombine_;  // copies per loop iteration of the current sub-test
+  unsigned int dataRange_;   // leading output bytes expected to be overwritten
+  unsigned char input[inSize_];    // NOTE(review): 4 MiB member, unused in OCLPerfMemCombine.cpp
+  unsigned char output[outSize_];  // NOTE(review): 4 MiB member, unused in OCLPerfMemCombine.cpp
+
+ private:
+  void createKernel(const char* type, int numCombine);  // generate, build and bind the kernel
+  void setData(cl_mem buffer, unsigned int bufSize, unsigned char val);  // fill via map
+  void checkData(cl_mem buffer, unsigned int bufSize, unsigned int limit,
+                 unsigned char defVal);  // bytes < limit must be 1, the rest defVal
+};
+
+#endif  // _OCL_MemCombine_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp
new file mode 100644
index 0000000000..d58b8bf381
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp
@@ -0,0 +1,176 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMemCreate.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+#if defined(_WIN32) && !defined(_WIN64)  // 32-bit Windows builds use half the sizes
+static const size_t BufSize = 0x200000;   // element counts, not bytes: run()
+static const size_t BufSizeC = 0x100000;  // multiplies them by sizeof(cl_int4)
+#else
+static const size_t BufSize = 0x400000;
+static const size_t BufSizeC = 0x200000;
+#endif
+
+static const size_t Iterations = 0x100;    // base pass count for even test_ values
+static const size_t IterationsC = 0x1000;  // pass count for odd test_ values
+
+static const char* strKernel =  // the store is guarded by local id < 0, so it should never execute
+    "__kernel void dummy(__global uint* out)    \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   uint value = 1;                         \n"
+    "   if ((int)get_local_id(0) < 0)           \n"
+    "       out[id] = value;                    \n"
+    "}                                          \n";
+
+#define NUM_TESTS 5  // distinct allocation variants (test_ ranges over 0..4)
+OCLPerfMemCreate::OCLPerfMemCreate() {
+  _numSubTests = NUM_TESTS * 2;  // each variant runs plain and again with a sub-buffer
+  failed_ = false;
+}
+
+OCLPerfMemCreate::~OCLPerfMemCreate() {}  // empty body: no cleanup is done here
+
+void OCLPerfMemCreate::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test % NUM_TESTS;  // sub-tests >= NUM_TESTS reuse the first NUM_TESTS variants
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  // The upper half of the sub-test range repeats each variant through a sub-buffer.
+  useSubBuf_ = (test >= NUM_TESTS);
+  // Not a GPU: remember that so run() returns immediately.
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {  // print the build log before the check below fires
+    char programLog[1024];
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; not referenced in OCLPerfMemCreate.cpp
+
+void OCLPerfMemCreate::run(void) {  // times buffer create / use / release cycles
+  if (failed_) {
+    return;
+  }  // failed_ is set by open() when the device is not a GPU
+  cl_mem buffer, subBuf;
+  cl_mem* bufptr;
+  unsigned int* values;
+  values = reinterpret_cast<unsigned int*>(new cl_int4[BufSize]);
+  CPerfCounter timer;
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  void* hostPtr = NULL;
+  // NOTE(review): if CHECK_RESULT returns early, `values` and live CL objects leak - confirm.
+  // Zero the host allocation; it is only handed to CL (as hostPtr) when test_ == 4.
+  memset(values, 0, BufSize * sizeof(cl_int4));
+  // Even test_ values start from BufSize/Iterations, odd ones from BufSizeC/IterationsC.
+  size_t bufSize = ((test_ % 2) == 0) ? BufSize * sizeof(cl_int4)
+                                      : BufSizeC * sizeof(cl_int4);
+  size_t iter = ((test_ % 2) == 0) ? Iterations : IterationsC;
+  // test_ 4 wraps the host memory (USE_HOST_PTR); test_ 2-3 add ALLOC_HOST_PTR.
+  if (test_ == 4) {
+    hostPtr = values;
+    bufSize = 0x100000;
+    flags = CL_MEM_USE_HOST_PTR;
+  } else if ((test_ / 2) > 0) {
+    iter = ((test_ % 2) == 0) ? Iterations / 10 : IterationsC;
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  }
+  timer.Reset();
+  timer.Start();
+  // Timed region: per pass one create, one small kernel launch, one release.
+  for (size_t i = 0; i < iter; ++i) {
+    buffer =
+        _wrapper->clCreateBuffer(context_, flags, bufSize, hostPtr, &error_);
+    bufptr = &buffer;
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    if (useSubBuf_) {  // the sub-buffer spans the whole parent and reuses its flags
+      cl_buffer_region reg;
+      reg.origin = 0;
+      reg.size = bufSize;
+      subBuf = _wrapper->clCreateSubBuffer(
+          buffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &reg, &error_);
+      bufptr = &subBuf;
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSubBuffer() failed");
+    }
+    // Bind whichever object was created last (the buffer or its sub-buffer).
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), bufptr);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    size_t gws[1] = {64};
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+    if (useSubBuf_) _wrapper->clReleaseMemObject(subBuf);  // child before parent
+    _wrapper->clReleaseMemObject(buffer);
+  }
+
+  timer.Stop();
+  std::stringstream stream;
+  // Label parts: allocation kind (test_ / 2), message (test_ % 2), size in KB.
+  static const char* Message[] = {" create+destroy time [uncached] ",
+                                  " create+destroy time [cached  ] "};
+  static const char* Type[] = {"DEV", "AHP", "UHP"};
+
+  stream << Type[test_ / 2];
+  stream << Message[test_ % 2];
+  stream << " per allocation (ms) ";
+  stream << bufSize / 1024 << " KB";
+  if (useSubBuf_) stream << " subbuf ";
+  testDescString = stream.str();
+  _perfInfo = static_cast<float>(timer.GetElapsedTime() * 1000 / iter);  // average ms per pass
+
+  delete[] values;
+}
+
+unsigned int OCLPerfMemCreate::close(void) { return OCLTestImp::close(); }  // forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h
new file mode 100644
index 0000000000..790b09a3a3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_MEM_CREATE_H_
+#define _OCL_PERF_MEM_CREATE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfMemCreate : public OCLTestImp {  // buffer create/release timing test
+ public:
+  OCLPerfMemCreate();
+  virtual ~OCLPerfMemCreate();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // builds the dummy kernel, requires a GPU
+  virtual void run(void);            // timed create / launch / release loop
+  virtual unsigned int close(void);  // forwards to OCLTestImp::close()
+
+ private:
+  bool failed_;        // set by open() when the device is not a GPU; run() then returns
+  unsigned int test_;  // sub-test number modulo NUM_TESTS (defined in the .cpp)
+  bool useSubBuf_;     // true for sub-test numbers >= NUM_TESTS
+};
+
+#endif  // _OCL_PERF_MEM_CREATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp
new file mode 100644
index 0000000000..d6d1c4828c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp
@@ -0,0 +1,418 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfMemLatency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 16;  // number of entries in Sizes
+// Input buffer sizes in bytes: powers of two from 2 KB up to 64 MB.
+static const unsigned int Sizes[NUM_SIZES] = {
+    2048,   4096,    8192,    16384,   32768,   65536,    131072,   262144,
+    524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864};
+// Bounded formatting: sprintf_s when WIN_OS is defined, snprintf otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfMemLatency::genShader() {  // rebuilds shader_ with both kernels
+  shader_.clear();
+  // MemWalker: follows x = input[x] for uCount hops, `repeats` times over.
+  // DO NOT PUBLISH
+  // Adopted from SiSoft Sandra 2013's memory latency test
+  shader_ +=
+      "#ifdef MAKEVOLATILE\n"
+      "#define VOLATILE volatile\n"
+      "#else\n"
+      "#define VOLATILE\n"
+      "#endif\n"
+      "__kernel\n"
+      //"__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void MemWalker(\n"
+      "    global VOLATILE uint * restrict input,\n"
+      "    __global uint * restrict output,\n"
+      "    const uint uCount,  const uint uSize,\n"
+      "    const uint uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "    uint o = uOffset;\n"
+      "    uint lid = 0;//get_local_id(0)*o;\n"
+      "    uint x = lid;\n"
+      "\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        while (i--) {\n"
+      "            x = input[x] /* + o*/;\n"
+      "        }\n"
+      "    }\n"
+      "\n"
+      "#ifdef MAKERW\n"
+      "    input[0] = x;\n"
+      "#endif\n"
+      "    output[0] = x;\n"
+      "}\n";
+  // Overhead: the same loop nest with arithmetic only, no dependent loads.
+  // printf("shader:\n%s\n", shader_.c_str());
+  shader_ += "\n\n";
+  shader_ +=
+      "__kernel\n"
+      //"__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void Overhead(\n"
+      "    __global uint * restrict input,\n"
+      "    __global uint * restrict output,\n"
+      "    const uint uCount,  const uint uSize,\n"
+      "    const uint uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "#ifdef USE_FLOAT\n"
+      "    float x = (float)input[0];\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        x = (float)uOffset*x;\n"
+      "        while (i--) {\n"
+      "            x += (float)i;\n"
+      "        }\n"
+      "    }\n"
+      "    output[0] = (uint)x;\n"
+      "#else\n"
+      "    uint x = input[0];\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        x = x*uOffset;\n"
+      "        while (i--) {\n"
+      "            x += i;\n"
+      "        }\n"
+      "    }\n"
+      "    output[0] = x;\n"
+      "#endif\n"
+      "}\n";
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty: ignores all arguments
+
+OCLPerfMemLatency::OCLPerfMemLatency() {
+  _numSubTests = NUM_SIZES * 6;     // six configurations per buffer size
+  maxSize_ = Sizes[NUM_SIZES - 1];  // last (largest) entry of Sizes
+}
+
+OCLPerfMemLatency::~OCLPerfMemLatency() {}  // empty; CL objects are released in close()
+
+// Writes the pointer-chase chain into `buffer`: DWORD (k * 1041) % size holds
+// ((k + 1) * 1041) % size, the index of the next hop. `val` is not used.
+void OCLPerfMemLatency::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, width_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // no NULL writes
+  for (unsigned int i = 0; i < bufSizeDW_; i++) {
+    data[(i * (1024 + 17)) % bufSizeDW_] = ((i + 1) * (1024 + 17)) % bufSizeDW_;
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);  // via the wrapper, like every other CL call
+}
+
+// Validates the walk: a complete pass over the chain written by setData() ends
+// back at index 0, so the first DWORD of `buffer` must read 0.
+void OCLPerfMemLatency::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, sizeof(cl_uint), 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // no NULL reads
+  if (data[0] != 0) {
+    printf("OutData= 0x%08x\n", data[0]);
+    CHECK_RESULT_NO_RETURN(data[0] != 0, "Data validation failed!\n");
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL);
+}
+
+// Prepares one subtest: decodes `test` into a buffer size and kernel variant,
+// selects the device, creates the context, queue and buffers, builds both
+// kernels, binds their arguments and seeds the input buffer.
+//   test       - subtest index: test % NUM_SIZES picks the buffer size; an
+//                odd test / NUM_SIZES enables moreThreads; test >= 2 *
+//                NUM_SIZES adds MAKEVOLATILE and test >= 4 * NUM_SIZES MAKERW
+//   units      - not used by this test
+//   conversion - always set to 1.0
+//   deviceId   - index of the device to run on
+// Failures are reported through CHECK_RESULT, which returns early.
+void OCLPerfMemLatency::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  kernel2_ = 0;  // close() tests this handle, so it must never be stale
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  _errorFlag = false;  // Reset error code so a single error doesn't prevent
+                       // other subtests from running
+  _errorMsg = "";
+  isAMD_ = false;
+
+  // Decode the subtest index first so these members are set on every path.
+  width_ = Sizes[test % NUM_SIZES];
+  bufSizeDW_ = width_ / sizeof(cl_uint);
+  moreThreads = ((test / NUM_SIZES) % 2) != 0;
+  makeVolatile = test >= 2 * NUM_SIZES;
+  makeRW = test >= 4 * NUM_SIZES;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // array delete, done before any early return
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  CHECK_RESULT(platform == 0, "Couldn't find OpenCL platform, cannot proceed");
+
+  char pbuf[100];
+  pbuf[0] = '\0';  // stays empty if the vendor query fails
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the result is deliberately not checked here.
+  if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD_ = true;
+  }
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  cl_device_id *devices =
+      (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "Failed to allocate devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) {
+    device = devices[_deviceId];
+  }
+  free(devices);  // only the selected id is needed from here on
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  // Context and command queue for the selected device.
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, width_, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, 0, 1 * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  // Preprocessor switches that select the kernel variant.
+  std::string args;
+  if (isAMD_) args += " -D USE_FLOAT";
+  if (makeVolatile) args += " -D MAKEVOLATILE";
+  if (makeRW) args += " -D MAKERW";
+
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    log[0] = '\0';  // stays empty if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+    // Flag the failure and stop; a constant-false condition would be a no-op.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+
+  kernel_ = _wrapper->clCreateKernel(program_, "MemWalker", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel(MemWalker) failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "Overhead", &error_);
+  CHECK_RESULT(kernel2_ == 0, "clCreateKernel(Overhead) failed");
+
+  // Limit the repeats, large buffers will have more samples, but the test runs
+  // for a long time
+  repeats_ = std::max((maxSize_ >> 4) / bufSizeDW_, 1u);
+
+  // Both kernels take the same seven arguments, so bind them in one loop and
+  // accumulate the status instead of overwriting it after every call.
+  unsigned int zero = 0;
+  int bMem = 1;
+  cl_kernel kernels[2] = {kernel_, kernel2_};
+  error_ = CL_SUCCESS;
+  for (int k = 0; k < 2; k++) {
+    cl_kernel krn = kernels[k];
+    // input and output buffers
+    error_ |= _wrapper->clSetKernelArg(krn, 0, sizeof(cl_mem), &inBuffer_);
+    error_ |= _wrapper->clSetKernelArg(krn, 1, sizeof(cl_mem), &outBuffer_);
+    // uCount and uSize: number of DWORDs in the input buffer
+    error_ |= _wrapper->clSetKernelArg(krn, 2, sizeof(cl_uint), &bufSizeDW_);
+    error_ |= _wrapper->clSetKernelArg(krn, 3, sizeof(cl_uint), &bufSizeDW_);
+    // uOffset, bMem and repeats
+    error_ |= _wrapper->clSetKernelArg(krn, 4, sizeof(cl_uint), &zero);
+    error_ |= _wrapper->clSetKernelArg(krn, 5, sizeof(cl_int), &bMem);
+    error_ |= _wrapper->clSetKernelArg(krn, 6, sizeof(cl_uint), &repeats_);
+  }
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+
+  setData(inBuffer_, (int)1.0f);
+}
+
+// Times one MemWalker launch, subtracts the time of an Overhead launch with
+// the same geometry and stores the difference per read, in nanoseconds, in
+// _perfInfo. testDescString receives a description of the configuration.
+// Returns early (via CHECK_RESULT) if a kernel cannot be enqueued.
+void OCLPerfMemLatency::run(void) {
+  // One work-item by default; with moreThreads, 64 on AMD and 32 elsewhere.
+  int global = 1;
+  if (moreThreads) {
+    global = isAMD_ ? 64 : 32;
+  }
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)global};  // always one work-group
+
+  // Warm-up
+  unsigned int warmup = 128;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), (void *)&warmup);
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  _wrapper->clFinish(cmd_queue_);
+
+  // Restore input buffer when finished as it may have been modified by RW test
+  setData(inBuffer_, (int)1.0f);
+
+  CPerfCounter timer, timer2;
+
+  timer.Reset();
+  timer.Start();
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+
+  checkData(outBuffer_);
+
+  // Same launch with the arithmetic-only kernel to measure the loop overhead.
+  timer2.Reset();
+  timer2.Start();
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel2_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+
+  _wrapper->clFinish(cmd_queue_);
+
+  timer2.Stop();
+  double sec = timer.GetElapsedTime() - timer2.GetElapsedTime();
+
+  // Read latency in ns
+  double perf = sec * (double)(1e09) / ((double)bufSizeDW_ * (double)repeats_);
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  char buf2[32];
+  // Size every write by its own destination: buf2 holds only 32 bytes.
+  if (makeRW)
+    SNPRINTF(buf2, sizeof(buf2), "volatileRW");
+  else if (makeVolatile)
+    SNPRINTF(buf2, sizeof(buf2), "volatile");
+  else
+    buf2[0] = '\0';
+  // bufSizeDW_ and repeats_ are unsigned, hence %u.
+  SNPRINTF(buf, sizeof(buf), "%10s %2d threads, %8u reads, %5u repeats (ns)",
+           buf2, global, bufSizeDW_, repeats_);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfMemLatency::close(void) {
+  _wrapper->clFinish(cmd_queue_);  // drain the queue before releasing anything
+  // Release each non-null handle; failures go to CHECK_RESULT_NO_RETURN.
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h
new file mode 100644
index 0000000000..0e2f0f4e98
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MEMLATENCY_H_
+#define _OCL_MEMLATENCY_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfMemLatency : public OCLTestImp {  // pointer-chase latency test
+ public:
+  OCLPerfMemLatency();
+  virtual ~OCLPerfMemLatency();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;   // OpenCL C source assembled by genShader()
+  void genShader(void);  // fills shader_ with the MemWalker/Overhead kernels
+  void setData(cl_mem buffer, unsigned int data);  // writes the index chain
+  void checkData(cl_mem buffer);  // expects the first DWORD of buffer to be 0
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;   // "MemWalker"
+  cl_kernel kernel2_;  // "Overhead"
+  cl_mem inBuffer_;    // width_ bytes, holds the index chain
+  cl_mem outBuffer_;   // one cl_uint written by the kernels
+  cl_int error_;       // status of the most recent OpenCL call
+
+  unsigned int width_;      // input buffer size in bytes
+  unsigned int bufSizeDW_;  // width_ / sizeof(cl_uint)
+  unsigned int repeats_;    // outer-loop count passed to both kernels
+  unsigned int maxSize_;    // largest entry of the Sizes table
+  bool isAMD_;        // vendor is "Advanced Micro Devices, Inc." with devices
+  bool moreThreads;   // launch 64 (AMD) or 32 work-items instead of 1
+  bool makeVolatile;  // build with -D MAKEVOLATILE
+  bool makeRW;        // build with -D MAKERW
+};
+
+#endif  // _OCL_MEMLATENCY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp
new file mode 100644
index 0000000000..f29724fc8b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp
@@ -0,0 +1,347 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfPinnedBufferReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Bounded formatting: sprintf_s when WIN_OS is defined, snprintf otherwise.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8
+// Transfer sizes in bytes: 1 KB, 4 KB, 8 KB, 16 KB, 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 4 * 1024, 8 * 1024, 16 * 1024, 262144, 1048576, 4194304, 16777216};
+// Iteration counts of the two blocking passes: a single read, then NUM_ITER.
+static const unsigned int Iterations[2] = {
+    1, OCLPerfPinnedBufferReadSpeed::NUM_ITER};
+#define NUM_OFFSETS 2
+static const unsigned int offsets[NUM_OFFSETS] = {0, 16};  // USE_HOST_PTR byte offsets
+#define NUM_SUBTESTS (1 + NUM_OFFSETS)  // ALLOC_HOST_PTR + one per offset
+// Subtests with an index below this value use blocking reads (set in the ctor).
+static cl_uint blockedSubtests;
+
+OCLPerfPinnedBufferReadSpeed::OCLPerfPinnedBufferReadSpeed() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2;  // two blocking passes
+  blockedSubtests = _numSubTests;  // indices below this read with CL_TRUE
+  _numSubTests += NUM_SIZES * NUM_SUBTESTS;  // plus one non-blocking pass
+}
+
+OCLPerfPinnedBufferReadSpeed::~OCLPerfPinnedBufferReadSpeed() {}  // empty; cleanup is in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty: ignores all arguments
+
+const char *blkStr[2] = {"n/b", "blk"};  // label indexed by the cl_bool blocking flag
+
+// Prepares one subtest: selects platform and device, decodes `test` into the
+// transfer size, host-memory variant and iteration count, then creates the
+// context, the queue and the two buffers that run() reads between.
+//   test       - subtest index; test % NUM_SIZES selects the transfer size
+//   units      - not used by this test
+//   conversion - always set to 1.0
+//   deviceId   - index of the device to run on
+void OCLPerfPinnedBufferReadSpeed::open(unsigned int test, char *units,
+                                        double &conversion,
+                                        unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;  // only used with USE_HOST_PTR, but never left indeterminate
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) platform = platforms[_platformIndex];
+    delete[] platforms;  // array delete, done before any early return
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char pbuf[100];
+  pbuf[0] = '\0';  // stays empty if the vendor query fails
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the result is deliberately not checked here.
+  if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD = true;
+  }
+
+  // CL_PLATFORM_VERSION has the form "OpenCL <major>.<minor> ...", so the
+  // three characters "<major>.<minor>" start at index 7.
+  char getVersion[128] = {0};
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  // Variant 0 lets the runtime allocate the host memory (ALLOC_HOST_PTR);
+  // variants 1..NUM_OFFSETS wrap malloc'ed memory at offsets[variant - 1].
+  const unsigned int variant = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (variant > 0) {
+    useHostPtr = true;
+    offset = offsets[variant - 1];
+  } else {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter = 4 * OCLPerfPinnedBufferReadSpeed::NUM_ITER /
+              ((_openTest % NUM_SIZES) + 1);
+  }
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  cl_device_id *devices =
+      (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) device = devices[_deviceId];
+  free(devices);  // only the selected id is needed; do not leak the list
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    // Round up to the next `alignment` boundary, then apply the offset.
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, 0, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, inBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Measures clEnqueueReadBuffer throughput from inBuffer_ into the mapped host
+// view of outBuffer_ and stores the result (GB/s) in _perfInfo.
+void OCLPerfPinnedBufferReadSpeed::run(void) {
+  CPerfCounter timer;
+  void *mem =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, inBuffer_, CL_TRUE, 0,
+                                         bufSize_, mem, 0, NULL, NULL);
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter && error_ == CL_SUCCESS; i++) {
+    error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, inBuffer_, blocking, 0,
+                                           bufSize_, mem, 0, NULL, NULL);
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  if (error_ != CL_SUCCESS) {
+    // Do not leave outBuffer_ mapped when a read failed.
+    _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0, NULL, NULL);
+    CHECK_RESULT(true, "clEnqueueReadBuffer failed");
+  }
+  double sec = timer.GetElapsedTime();
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char str[256] = "";  // defined even if neither host-pointer mode is set
+  if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u   USE_HOST_PTR (GB/s)", offset);
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %3s i: %4u %31s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
+
+unsigned int OCLPerfPinnedBufferReadSpeed::close(void) {
+  _wrapper->clFinish(cmd_queue_);  // drain the queue before releasing anything
+  if (inBuffer_) {  // each non-null handle is released exactly once below
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  if (hostMem) {  // malloc'ed by open() for the USE_HOST_PTR variants
+    free(hostMem);
+  }
+
+  return _crcword;
+}
+
+// Rectangular variant of the read benchmark: reads a width x width byte
+// region (width = floor(sqrt(bufSize_))) with clEnqueueReadBufferRect and
+// stores the bandwidth (GB/s) in _perfInfo. Skipped on OpenCL 1.0 platforms.
+void OCLPerfPinnedBufferReadRectSpeed::run(void) {
+  // Skip for 1.0 platforms. Checked before mapping so that the early return
+  // does not leave outBuffer_ mapped.
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    testDescString = " SKIPPED ";
+    return;
+  }
+  CPerfCounter timer;
+  void *mem =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  // Clamp iteration count to reduce test run time
+  unsigned int testNumIter = (numIter < 100 ? numIter : 100);
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBufferRect(
+      cmd_queue_, inBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testNumIter && error_ == CL_SUCCESS; i++) {
+    error_ = _wrapper->clEnqueueReadBufferRect(
+        cmd_queue_, inBuffer_, blocking, bufOrigin, hostOrigin, region, width,
+        0, width, 0, mem, 0, NULL, NULL);
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  if (error_ != CL_SUCCESS) {
+    // Do not leave outBuffer_ mapped when a read failed.
+    _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0, NULL, NULL);
+    CHECK_RESULT(true, "clEnqueueReadBufferRect failed");
+  }
+  double sec = timer.GetElapsedTime();
+  // GB/s from bufSize_. NOTE(review): only width * width bytes are read.
+  double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char str[256] = "";  // defined even if neither host-pointer mode is set
+  if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u   USE_HOST_PTR (GB/s)", offset);
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %3s i: %4u %31s ", bufSize_,
+           blkStr[blocking], testNumIter, str);
+  testDescString = buf;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h
new file mode 100644
index 0000000000..1999cac84e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PinnedBufferReadSpeed_H_
+#define _OCL_PinnedBufferReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfPinnedBufferReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfPinnedBufferReadSpeed();
+  virtual ~OCLPerfPinnedBufferReadSpeed();
+  // Test entry points (virtual, so derived tests can replace them).
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;
+  // Test state (all public).
+  cl_context context_;
+  cl_command_queue cmd_queue_;  // queue the timed transfers are enqueued on
+  cl_mem inBuffer_;             // buffer read by clEnqueueReadBufferRect
+  cl_mem outBuffer_;            // its host mapping is the read destination
+  cl_int error_;                // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // transfer size in bytes
+  bool persistent;        // NOTE(review): not referenced in the visible code
+  bool allocHostPtr;      // selects the "ALLOC_HOST_PTR" result label
+  bool useHostPtr;        // selects the "USE_HOST_PTR" result label
+  unsigned int numIter;   // requested iterations; the rect test caps it at 100
+  char* hostMem;          // presumably the raw host allocation -- TODO confirm
+  char* alignedMem;       // presumably hostMem after alignment -- TODO confirm
+  size_t alignment;       // presumably the alignment in bytes -- TODO confirm
+  unsigned int offset;    // printed as "off:" for USE_HOST_PTR subtests
+  bool isAMD;             // presumably set for the AMD platform -- TODO confirm
+  char platformVersion[32];  // chars [0] and [2] are tested to skip 1.0 platforms
+};
+
+class OCLPerfPinnedBufferReadRectSpeed : public OCLPerfPinnedBufferReadSpeed {
+ public:
+  OCLPerfPinnedBufferReadRectSpeed() : OCLPerfPinnedBufferReadSpeed() {}
+  // Only run() is redeclared; open() and close() come from the base class.
+ public:
+  virtual void run(void);
+};
+
+#endif  // _OCL_PinnedBufferReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp
new file mode 100644
index 0000000000..2fccd41163
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp
@@ -0,0 +1,342 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfPinnedBufferWriteSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8
+// Transfer sizes in bytes: 1KB, 4KB, 8KB, 16KB, 256KB, 1MB, 4MB, 16MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 4 * 1024, 8 * 1024, 16 * 1024, 262144, 1048576, 4194304, 16777216};
+// Subtest indices below this value use blocking writes; set by the constructor.
+static cl_uint blockedSubtests;
+// Iteration counts for the two blocking passes.
+static const unsigned int Iterations[2] = {
+    1, OCLPerfPinnedBufferWriteSpeed::NUM_ITER};
+#define NUM_OFFSETS 2
+static const unsigned int offsets[NUM_OFFSETS] = {0, 16};  // bytes added to the aligned USE_HOST_PTR pointer
+#define NUM_SUBTESTS (1 + NUM_OFFSETS)  // ALLOC_HOST_PTR + one USE_HOST_PTR mode per offset
+OCLPerfPinnedBufferWriteSpeed::OCLPerfPinnedBufferWriteSpeed() {
+  // Two blocking passes come first, followed by one non-blocking pass.
+  blockedSubtests = 2 * NUM_SIZES * NUM_SUBTESTS;
+  _numSubTests = blockedSubtests + NUM_SIZES * NUM_SUBTESTS;
+}
+
+OCLPerfPinnedBufferWriteSpeed::~OCLPerfPinnedBufferWriteSpeed() {}  // cleanup happens in close()
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; handed to clCreateContext in open()
+
+extern const char *blkStr[2];
+
+// Sets up subtest `test`: picks the platform and device, decodes buffer size,
+// host-pointer mode and iteration count from the subtest index, then creates
+// the context, queue and buffers. `units` is not used; `conversion` becomes 1.
+// NOTE: CHECK_RESULT is assumed to return from open() when its condition holds.
+void OCLPerfPinnedBufferWriteSpeed::open(unsigned int test, char *units,
+                                         double &conversion,
+                                         unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  // Clear every member close() inspects so that a failed open() stays safe.
+  context_ = 0;
+  cmd_queue_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the handle out and free the list before any early return.
+    if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char pbuf[100] = "";
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+  // Runtime returns an error when no GPU devices are present instead of just
+  // returning 0 devices, so the status is deliberately not checked here.
+  if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+    isAMD = true;
+  }
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Keep the "<major>.<minor>" characters that follow the "OpenCL " prefix.
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  // Mode 0 is ALLOC_HOST_PTR; modes 1..NUM_OFFSETS are USE_HOST_PTR variants.
+  const unsigned int mode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (mode > 0) {
+    useHostPtr = true;
+    offset = offsets[mode - 1];
+  } else {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter = 4 * OCLPerfPinnedBufferWriteSpeed::NUM_ITER /
+              ((_openTest % NUM_SIZES) + 1);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Copy the selected handle, then free the list before any early return.
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be aligned and then shifted by offset.
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  inBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, 0, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, inBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter writes from the mapped inBuffer_ to outBuffer_ (GB/s in _perfInfo).
+void OCLPerfPinnedBufferWriteSpeed::run(void) {
+  CPerfCounter stopwatch;
+  // Subtests below blockedSubtests use blocking writes; the rest do not.
+  const cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+  void *src =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_WRITE,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  // Warm up
+  error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
+                                          bufSize_, src, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int iter = 0; iter != numIter; ++iter) {
+    error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, blocking, 0,
+                                            bufSize_, src, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
+  }
+  // Non-blocking writes have only finished once the queue has drained.
+  if (blocking == CL_FALSE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  stopwatch.Stop();
+  const double elapsedSec = stopwatch.GetElapsedTime();
+  // Buffer write bandwidth in GB/s
+  _perfInfo =
+      (float)(((double)bufSize_ * numIter * (double)(1e-09)) / elapsedSec);
+
+  char mode[256];
+  if (allocHostPtr) {
+    SNPRINTF(mode, sizeof(mode), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(mode, sizeof(mode), "off: %4d   USE_HOST_PTR (GB/s)", offset);
+  }
+  char line[256];
+  SNPRINTF(line, sizeof(line), " (%8d bytes) %3s i: %4d %31s ", bufSize_,
+           blkStr[blocking], numIter, mode);
+  testDescString = line;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, src, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
+
+// Releases the objects open() created and returns _crcword.
+unsigned int OCLPerfPinnedBufferWriteSpeed::close(void) {
+  // open() zeroes cmd_queue_ before creating it; never hand a null queue on.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  free(hostMem);  // free(NULL) is a no-op, so no guard is needed
+  hostMem = alignedMem = NULL;  // alignedMem pointed into the freed block
+  return _crcword;
+}
+
+// Rectangular variant of run(): times clEnqueueWriteBufferRect calls on a
+// width x width region, with the iteration count capped at 100.
+void OCLPerfPinnedBufferWriteRectSpeed::run(void) {
+  // Skip for 1.0 platforms. Tested before the map so that the early return
+  // does not leave inBuffer_ mapped.
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    testDescString = " SKIPPED ";
+    return;
+  }
+  CPerfCounter timer;
+  void *mem =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_READ,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  // Clamp iteration count to reduce test run time
+  unsigned int testNumIter = (numIter < 100 ? numIter : 100);
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueWriteBufferRect(
+      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testNumIter; i++) {
+    error_ = _wrapper->clEnqueueWriteBufferRect(
+        cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width,
+        0, width, 0, mem, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
+  }
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  // Buffer write bandwidth in GB/s. NOTE(review): this counts bufSize_ bytes
+  // per call although width * width are written; they differ for 8 * 1024.
+  double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char str[256];
+  if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4d   USE_HOST_PTR (GB/s)", offset);
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %31s ", bufSize_,
+           blkStr[blocking], testNumIter, str);
+  testDescString = buf;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h
new file mode 100644
index 0000000000..40b2620053
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PinnedBufferWriteSpeed_H_
+#define _OCL_PinnedBufferWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfPinnedBufferWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfPinnedBufferWriteSpeed();
+  virtual ~OCLPerfPinnedBufferWriteSpeed();
+  // open() builds the OpenCL state, run() times the writes, close() frees it.
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;  // base for the per-subtest iteration count
+  // Test state (all public).
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem inBuffer_;   // buffer that run() maps and uses as the write source
+  cl_mem outBuffer_;  // destination of the timed clEnqueueWriteBuffer calls
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // transfer size in bytes
+  bool persistent;        // set to false by open(); not read in the visible code
+  bool allocHostPtr;      // inBuffer_ is created with CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;        // inBuffer_ is created with CL_MEM_USE_HOST_PTR
+  unsigned int numIter;   // number of timed writes for the current subtest
+  char* hostMem;          // malloc'ed backing store for USE_HOST_PTR; freed in close()
+  char* alignedMem;       // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;       // open() sets this to 4096
+  unsigned int offset;    // byte offset applied to the aligned pointer
+  bool isAMD;             // vendor string matched "Advanced Micro Devices, Inc."
+  char platformVersion[32];  // three characters copied from CL_PLATFORM_VERSION
+};
+
+class OCLPerfPinnedBufferWriteRectSpeed : public OCLPerfPinnedBufferWriteSpeed {
+ public:
+  OCLPerfPinnedBufferWriteRectSpeed() : OCLPerfPinnedBufferWriteSpeed() {}
+  // Only run() is redeclared; open() and close() come from the base class.
+ public:
+  virtual void run(void);
+};
+
+#endif  // _OCL_PinnedBufferWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp
new file mode 100644
index 0000000000..3c4bfd66a3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp
@@ -0,0 +1,504 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfPipeCopySpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <complex>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+// OpenCL C source for the pipe kernels; KERNEL_CODE turns the tokens into a string.
+const static char * strKernel =
+{  // NOTE(review): resId is `local` even in the per-work-item *_reserve kernels; confirm intent
+    KERNEL_CODE(
+    \n
+        kernel void initPipe(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            int gid = get_global_id(0);\n
+            write_pipe(outPipe, &inBuf[gid]);\n
+        }\n
+    \n
+        kernel void copyPipe(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            DATA_TYPE tmp;\n
+            read_pipe(inPipe, &tmp);\n
+            write_pipe(outPipe, &tmp);\n
+        }\n
+    \n
+        kernel void readPipe(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n
+        {\n
+            int gid = get_global_id(0);\n
+            DATA_TYPE tmp;\n
+            read_pipe(inPipe, &tmp);\n
+            outBuf[gid] = tmp;\n
+        }\n
+    \n
+        kernel void initPipe_reserve(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            int gid = get_global_id(0);\n
+            local reserve_id_t resId;\n
+            resId = reserve_write_pipe(outPipe, 1);\n
+            if (is_valid_reserve_id(resId)) {\n
+                write_pipe(outPipe, resId, 0, &inBuf[gid]);\n
+                commit_write_pipe(outPipe, resId);\n
+            }\n
+        }\n
+    \n
+        kernel void copyPipe_reserve(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            local reserve_id_t resId;\n
+            resId = reserve_read_pipe(inPipe, 1);\n
+            if (is_valid_reserve_id(resId)) {\n
+                DATA_TYPE tmp;\n
+                read_pipe(inPipe, resId, 0, &tmp);\n
+                commit_read_pipe(inPipe, resId);\n
+                resId = reserve_write_pipe(outPipe, 1);\n
+                if (is_valid_reserve_id(resId)) {\n
+                    write_pipe(outPipe, resId, 0, &tmp);\n
+                    commit_write_pipe(outPipe, resId);\n
+                }\n
+            }\n
+        }\n
+    \n
+        kernel void readPipe_reserve(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n
+        {\n
+            int gid = get_global_id(0);\n
+            local reserve_id_t resId;\n
+            resId = reserve_read_pipe(inPipe, 1);\n
+            if (is_valid_reserve_id(resId)) {\n
+                DATA_TYPE tmp;\n
+                read_pipe(inPipe, resId, 0, &tmp);\n
+                commit_read_pipe(inPipe, resId);\n
+                outBuf[gid] = tmp;\n
+            }\n
+        }\n
+    \n
+        kernel void initPipe_wg(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            int gid = get_global_id(0);\n
+            local reserve_id_t resId;\n
+            resId = work_group_reserve_write_pipe(outPipe, get_local_size(0));\n
+            if (is_valid_reserve_id(resId)) {\n
+                write_pipe(outPipe, resId, get_local_id(0), &inBuf[gid]);\n
+                work_group_commit_write_pipe(outPipe, resId);\n
+            }\n
+        }\n
+    \n
+        kernel void copyPipe_wg(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            local reserve_id_t resId;\n
+            resId = work_group_reserve_read_pipe(inPipe, get_local_size(0));\n
+            if (is_valid_reserve_id(resId)) {\n
+                DATA_TYPE tmp;\n
+                read_pipe(inPipe, resId, get_local_id(0), &tmp);\n
+                work_group_commit_read_pipe(inPipe, resId);\n
+                resId = work_group_reserve_write_pipe(outPipe, get_local_size(0));\n
+                if (is_valid_reserve_id(resId)) {\n
+                    write_pipe(outPipe, resId, get_local_id(0), &tmp);\n
+                    work_group_commit_write_pipe(outPipe, resId);\n
+                }\n
+            }\n
+        }\n
+    \n
+        kernel void readPipe_wg(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n
+        {\n
+            int gid = get_global_id(0);\n
+            local reserve_id_t resId;\n
+            resId = work_group_reserve_read_pipe(inPipe, get_local_size(0));\n
+            if (is_valid_reserve_id(resId)) {\n
+                DATA_TYPE tmp;\n
+                read_pipe(inPipe, resId, get_local_id(0), &tmp);\n
+                work_group_commit_read_pipe(inPipe, resId);\n
+                outBuf[gid] = tmp;\n
+            }\n
+        }\n
+    \n
+\x23 ifdef SUBGROUPS\n
+        \x23 pragma OPENCL EXTENSION cl_khr_subgroups : enable\n
+        kernel __attribute__((reqd_work_group_size(64,1,1))) void initPipe_sg(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            int gid = get_global_id(0);\n
+            local reserve_id_t resId;\n
+            resId = sub_group_reserve_write_pipe(outPipe, get_local_size(0));\n
+            if (is_valid_reserve_id(resId)) {\n
+                write_pipe(outPipe, resId, get_local_id(0), &inBuf[gid]);\n
+                sub_group_commit_write_pipe(outPipe, resId);\n
+            }\n
+        }\n
+    \n
+        kernel __attribute__((reqd_work_group_size(64,1,1))) void copyPipe_sg(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n
+        {\n
+            local reserve_id_t resId;\n
+            resId = sub_group_reserve_read_pipe(inPipe, get_local_size(0));\n
+            if (is_valid_reserve_id(resId)) {\n
+                DATA_TYPE tmp;\n
+                read_pipe(inPipe, resId, get_local_id(0), &tmp);\n
+                sub_group_commit_read_pipe(inPipe, resId);\n
+                resId = sub_group_reserve_write_pipe(outPipe, get_local_size(0));\n
+                if (is_valid_reserve_id(resId)) {\n
+                    write_pipe(outPipe, resId, get_local_id(0), &tmp);\n
+                    sub_group_commit_write_pipe(outPipe, resId);\n
+                }\n
+            }\n
+        }\n
+    \n
+        kernel __attribute__((reqd_work_group_size(64,1,1))) void readPipe_sg(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n
+        {\n
+            int gid = get_global_id(0);\n
+            local reserve_id_t resId;\n
+            resId = sub_group_reserve_read_pipe(inPipe, get_local_size(0));\n
+            if (is_valid_reserve_id(resId)) {\n
+                DATA_TYPE tmp;\n
+                read_pipe(inPipe, resId, get_local_id(0), &tmp);\n
+                sub_group_commit_read_pipe(inPipe, resId);\n
+                outBuf[gid] = tmp;\n
+            }\n
+        }\n
+\x23 endif\n
+    \n
+    )
+};
+
+#define NUM_SIZES 6
+// Buffer sizes in bytes: 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB
+static const unsigned int Sizes[NUM_SIZES] = {4096,   8192,    65536,
+                                              262144, 1048576, 4194304};
+// Packet types handed to the kernels as DATA_TYPE, and their sizes in bytes.
+#define NUM_TYPES 3
+static const char *types[NUM_TYPES] = {"int", "int4", "int16"};
+static const unsigned int typeSize[NUM_TYPES] = {4, 16, 64};
+// Number of kernel families; open() derives testIdx_ from the subtest index.
+#define NUM_TESTS 4
+
+OCLPerfPipeCopySpeed::OCLPerfPipeCopySpeed() {
+  _numSubTests = NUM_TESTS * NUM_SIZES * NUM_TYPES;  // one subtest per test/size/type combination
+}
+
+OCLPerfPipeCopySpeed::~OCLPerfPipeCopySpeed() {}  // empty: does no cleanup itself
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op; not referenced in the visible part of this file
+
+// Fills `buffer` so that every dword of element i holds the value i.
+void OCLPerfPipeCopySpeed::setData(cl_mem buffer) {
+  const int dwTypeSize = (int)(typeSize[typeIdx_]) >> 2;  // dwords per element
+  int *mem = (int *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, CL_TRUE,
+                                                 CL_MAP_WRITE, 0, bufSize_, 0,
+                                                 NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  for (int i = 0; i < (int)numElements; i++) {
+    for (int j = 0; j < dwTypeSize; j++) {
+      mem[i * dwTypeSize + j] = i;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, (void *)mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_);  // via the wrapper, like every other call here
+}
+
+// Verifies buffer holds each value 0..numElements-1 once; prints <= 5 errors per pass.
+void OCLPerfPipeCopySpeed::checkData(cl_mem buffer) {
+  const int dwTypeSize = (int)(typeSize[typeIdx_]) >> 2;  // dwords per element
+  char *histo = (char *)calloc(numElements, sizeof(char));  // histo[v]: v seen
+  CHECK_RESULT(histo == 0, "calloc(histo) failed");
+  int *mem = (int *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, CL_TRUE,
+                                                 CL_MAP_READ, 0, bufSize_, 0,
+                                                 NULL, NULL, &error_);
+  if (error_ != CL_SUCCESS) free(histo);  // the check below returns early
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  int errCnt = 0;
+  for (int i = 0; (i < (int)numElements) && (errCnt < 5); i++) {
+    int tmp = mem[dwTypeSize * i];
+    for (int j = 1; (j < dwTypeSize) && (errCnt < 5); j++) {
+      if (mem[i * dwTypeSize + j] != tmp) {
+        printf("BAD DATA at element %d, ref %d, got %d\n", i, tmp,
+               mem[i * dwTypeSize + j]);
+        errCnt++;
+      }
+    }
+    const bool inRange = (tmp >= 0) && (tmp < (int)numElements);  // guards histo[]
+    if (!inRange || (histo[tmp] == 1)) {
+      printf("BAD DATA at element %d, val %d %s!\n", i, tmp,
+             inRange ? "already found" : "out of range");
+      errCnt++;
+    }
+    if (inRange) histo[tmp] = 1;
+  }
+  for (int i = 0, missing = 0; (i < (int)numElements) && (missing < 5); i++) {
+    if (histo[i] != 1) {
+      printf("BAD DATA at element %d, val not found!\n", i);
+      missing++;
+    }
+  }
+  free(histo);  // released before the unmap check, which may return early
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, (void *)mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfPipeCopySpeed::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  _crcword = 0;
+  conversion = 1.0f;
+
+  cl_device_id device = devices_[deviceId];
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  program_ = 0;
+  initPipe_ = 0;
+  copyPipe_ = 0;
+  readPipe_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  pipe_[0] = 0;
+  pipe_[1] = 0;
+  failed_ = false;
+  subgroupSupport_ = false;
+
+  bufSize_ = Sizes[test % NUM_SIZES];
+  typeIdx_ = (test / NUM_SIZES) % NUM_TYPES;
+  testIdx_ = test / (NUM_SIZES * NUM_TYPES);
+
+  numIter = NUM_ITER;
+
+  char getVersion[128];
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_VERSION,
+                                     sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (getVersion[7] < '2') {
+    failed_ = true;
+    _errorMsg = "OpenCL 2.0 not supported";
+    return;
+  }
+
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_,
+                                        NULL, &error_);
+  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+
+  numElements = bufSize_ / typeSize[typeIdx_];
+  char args[100];
+
+#if defined(CL_VERSION_2_0)
+  pipe_[0] =
+      _wrapper->clCreatePipe(context_, CL_MEM_HOST_NO_ACCESS,
+                             typeSize[typeIdx_], numElements, NULL, &error_);
+  CHECK_RESULT(pipe_[0] == 0, "clCreatePipe(pipe_[0]) failed");
+
+  pipe_[1] =
+      _wrapper->clCreatePipe(context_, CL_MEM_HOST_NO_ACCESS,
+                             typeSize[typeIdx_], numElements, NULL, &error_);
+  CHECK_RESULT(pipe_[1] == 0, "clCreatePipe(pipe_[1]) failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char *p = strstr(charbuf, "cl_khr_subgroups");
+  if (p) {
+    subgroupSupport_ = true;
+    SNPRINTF(args, sizeof(args), "-cl-std=CL2.0 -D DATA_TYPE=%s -D SUBGROUPS",
+             types[typeIdx_]);
+  } else {
+    if (test >= (NUM_SIZES * NUM_TYPES * 3)) {
+      // No support for subgroups, so skip these tests
+      failed_ = true;
+      _errorMsg = "Subgroup extension not supported";
+      return;
+    }
+    SNPRINTF(args, sizeof(args), "-cl-std=CL2.0 -D DATA_TYPE=%s",
+             types[typeIdx_]);
+  }
+#endif
+
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_,
+                                        NULL, &error_);
+  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, args, NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    printf("\nerror: %d\n", error_);
+    cl_int intError;
+    char log[16384];
+    intError =
+        _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                        16384 * sizeof(char), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    CHECK_RESULT(0, "clBuildProgram failed");
+  }
+  if (testIdx_ == 0) {
+    initPipe_ = _wrapper->clCreateKernel(program_, "initPipe", &error_);
+    CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed");
+    copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe", &error_);
+    CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed");
+    readPipe_ = _wrapper->clCreateKernel(program_, "readPipe", &error_);
+    CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed");
+    testName_ = "r/w";
+  } else if (testIdx_ == 1) {
+    initPipe_ = _wrapper->clCreateKernel(program_, "initPipe_reserve", &error_);
+    CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed");
+    copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe_reserve", &error_);
+    CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed");
+    readPipe_ = _wrapper->clCreateKernel(program_, "readPipe_reserve", &error_);
+    CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed");
+    numIter = 10;  // Limit iteration count because this test is very slow
+    testName_ = "r/w w/ reserve";
+  } else if (testIdx_ == 2) {
+    initPipe_ = _wrapper->clCreateKernel(program_, "initPipe_wg", &error_);
+    CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed");
+    copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe_wg", &error_);
+    CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed");
+    readPipe_ = _wrapper->clCreateKernel(program_, "readPipe_wg", &error_);
+    CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed");
+    testName_ = "wg r/w w/ reserve";
+  } else if (testIdx_ == 3) {
+    initPipe_ = _wrapper->clCreateKernel(program_, "initPipe_sg", &error_);
+    CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed");
+    copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe_sg", &error_);
+    CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed");
+    readPipe_ = _wrapper->clCreateKernel(program_, "readPipe_sg", &error_);
+    CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed");
+    testName_ = "sg r/w w/ reserve";
+  } else {
+    CHECK_RESULT(1, "Invalid test index!");
+  }
+  setData(srcBuffer_);
+}
+
+void OCLPerfPipeCopySpeed::run(void) {
+  // Times numIter pipe-to-pipe copies and reports the bandwidth in GB/s via
+  // _perfInfo and testDescString. Does nothing when open() set failed_.
+  if (failed_) return;
+  CPerfCounter timer;
+  size_t global_work_size[1] = {(size_t)numElements};
+  size_t local_work_size[1] = {64};
+
+  // Run initPipe on srcBuffer_ and pipe_[0].
+  error_ = _wrapper->clSetKernelArg(initPipe_, 0, sizeof(cl_mem), &srcBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(initPipe, 0) failed");
+  error_ = _wrapper->clSetKernelArg(initPipe_, 1, sizeof(cl_mem), &pipe_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(initPipe, 1) failed");
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, initPipe_, 1, NULL,
+                                            global_work_size, local_work_size,
+                                            0, NULL, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed");
+
+  // Warm up: one untimed copy with pipe_[0] as arg 0 and pipe_[1] as arg 1.
+  error_ = _wrapper->clSetKernelArg(copyPipe_, 0, sizeof(cl_mem), &pipe_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(copyPipe, 0) failed");
+  error_ = _wrapper->clSetKernelArg(copyPipe_, 1, sizeof(cl_mem), &pipe_[1]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(copyPipe, 1) failed");
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, copyPipe_, 1, NULL,
+                                            global_work_size, local_work_size,
+                                            0, NULL, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    // Alternate the two pipes between copyPipe's arguments each iteration.
+    error_ = _wrapper->clSetKernelArg(copyPipe_, 0, sizeof(cl_mem),
+                                      &pipe_[(i + 1) % 2]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(copyPipe, 0) failed");
+    error_ = _wrapper->clSetKernelArg(copyPipe_, 1, sizeof(cl_mem),
+                                      &pipe_[i % 2]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(copyPipe, 1) failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, copyPipe_, 1, NULL, global_work_size, local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clFinish failed");
+  timer.Stop();
+  // pipe_[(numIter - 1) % 2] was arg 1 of the last copy and holds the data.
+  error_ = _wrapper->clSetKernelArg(readPipe_, 0, sizeof(cl_mem),
+                                    &pipe_[(numIter - 1) % 2]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(readPipe, 0) failed");
+  error_ = _wrapper->clSetKernelArg(readPipe_, 1, sizeof(cl_mem), &dstBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(readPipe, 1) failed");
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, readPipe_, 1, NULL,
+                                            global_work_size, local_work_size,
+                                            0, NULL, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel(readPipe) failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clFinish failed");
+  checkData(dstBuffer_);
+  double sec = timer.GetElapsedTime();
+
+  // Pipe copy total bandwidth in GB/s
+  double perf = 2. * ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %17s (%8d bytes) block size: %2d i:%4d (GB/s) ",
+           testName_.c_str(), bufSize_, typeSize[typeIdx_], numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLPerfPipeCopySpeed::close(void) {
+  // Release everything open() created; handles are zeroed so a repeated
+  // close() cannot release them twice.
+  cl_kernel *kernels[] = {&initPipe_, &copyPipe_, &readPipe_};
+  for (unsigned int i = 0; i < sizeof(kernels) / sizeof(kernels[0]); i++) {
+    if (*kernels[i] == 0) continue;
+    error_ = _wrapper->clReleaseKernel(*kernels[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    *kernels[i] = 0;
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+    program_ = 0;
+  }
+  cl_mem *mems[] = {&srcBuffer_, &pipe_[0], &pipe_[1], &dstBuffer_};
+  for (unsigned int i = 0; i < sizeof(mems) / sizeof(mems[0]); i++) {
+    if (*mems[i] == 0) continue;
+    error_ = _wrapper->clReleaseMemObject(*mems[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed");
+    *mems[i] = 0;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h
new file mode 100644
index 0000000000..e517399d36
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PipeCopySpeed_H_
+#define _OCL_PipeCopySpeed_H_
+
+#include "OCLTestImp.h"
+
+// Perf test that times copies between two OpenCL 2.0 pipes (GB/s).
+class OCLPerfPipeCopySpeed : public OCLTestImp {
+ public:
+  OCLPerfPipeCopySpeed();
+  virtual ~OCLPerfPipeCopySpeed();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 100;  // default for numIter
+  void setData(cl_mem buffer);
+  void checkData(cl_mem buffer);
+
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;    // arg 0 of the initPipe kernel
+  cl_mem pipe_[2];      // the two pipes run() alternates between
+  cl_mem dstBuffer_;    // arg 1 of the readPipe kernel
+  cl_program program_;
+  cl_kernel initPipe_;  // args: srcBuffer_, pipe_[0]
+  cl_kernel copyPipe_;  // args: pipe, pipe
+  cl_kernel readPipe_;  // args: pipe, dstBuffer_
+
+  unsigned int bufSize_;     // buffer size in bytes
+  unsigned int typeIdx_;     // index into the element type tables
+  unsigned int numElements;  // bufSize_ divided by the element size
+  unsigned int numIter;      // timed iterations for the current sub-test
+  unsigned int testIdx_;     // selects the kernel variant
+  std::string testName_;     // label printed in the result line
+  bool subgroupSupport_;     // device lists cl_khr_subgroups
+  bool failed_;              // set by open(); makes run() return early
+};
+
+#endif  // _OCL_PipeCopySpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp
new file mode 100644
index 0000000000..950958740b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp
@@ -0,0 +1,549 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfProgramGlobalRead.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 4;       // entries in Sizes[]
+static const unsigned int NUM_READ_MODES = 6;  // entries in NumReads[]
+// Only the first MAX_READ_MODES entries of NumReads[] (up to 32 reads) are run
+static const unsigned int MAX_READ_MODES = 4;
+
+static const unsigned int NumReads[NUM_READ_MODES] = {1, 4, 16, 32, 64, 128};
+// Buffer sizes in bytes: 256 KB, 1 MB, 4 MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+static const unsigned int MaxTypes = 6;  // entries in types[] and TypeSize[]
+static unsigned int NumTypes = MaxTypes;  // adjusted by the constructor
+static const char *types[MaxTypes] = {"char", "short", "int",
+                                      "long", "float", "double"};
+static unsigned int StartType = 0;  // offset added to the type index in open()
+static const unsigned int NumVecWidths =
+    3;  // 5; char8 global scope does not work; bug opened
+static const char *vecWidths[NumVecWidths] = {"", "2", "4"};  //, "8", "16"};
+static const unsigned int vecWidths_int[NumVecWidths] = {1, 2, 4};  //, 8, 16};
+static const unsigned int TypeSize[MaxTypes] = {
+    sizeof(cl_char), sizeof(cl_short), sizeof(cl_int),
+    sizeof(cl_long), sizeof(cl_float), sizeof(cl_double)};
+#define CHAR_BUF_SIZE 512
+
+// Quiet pesky warnings: WIN_OS builds format through sprintf_s, others snprintf
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfProgramGlobalRead::genShader(unsigned int type,
+                                         unsigned int vecWidth,
+                                         unsigned int numReads,
+                                         unsigned int bufSize) {
+  // Rebuilds shader_: a program-scope array gp[bufSize] of
+  // types[type] + vecWidths[vecWidth] elements plus the _ReadSpeed kernel,
+  // which reads gp once when numReads == 1 and otherwise does numReads / 4
+  // rounds of four accumulating reads, then stores to outBuf at its global id.
+  const char *const elemType = types[type];
+  const char *const elemWidth = vecWidths[vecWidth];
+  char line[CHAR_BUF_SIZE];
+  shader_.clear();
+  shader_.append(
+      "#ifdef USE_ARENA\n"
+      "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n"
+      "#endif\n"
+      "#ifdef USE_AMD_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+      "#endif\n"
+      "#ifdef USE_KHR_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      "#endif\n");
+  SNPRINTF(line, CHAR_BUF_SIZE, "__global %s%s gp[%d];\n", elemType, elemWidth,
+           bufSize);
+  shader_ += line;
+  SNPRINTF(line, CHAR_BUF_SIZE,
+           "__kernel void __attribute__((reqd_work_group_size(64,1,1))) "
+           "_ReadSpeed(__global %s%s * restrict outBuf, constant uint * "
+           "restrict constBuf)\n",
+           elemType, elemWidth);
+  shader_ += line;
+  shader_.append(
+      "{\n"
+      "    uint i = (uint) get_global_id(0);\n");
+
+  if (numReads == 1) {
+    SNPRINTF(line, CHAR_BUF_SIZE, "    %s%s temp = 0;\n", elemType, elemWidth);
+    shader_ += line;
+    shader_.append(
+        "    const unsigned int Max = constBuf[0];\n"
+        "    temp = *(gp + i % Max);\n"
+        "    *(outBuf + i) = temp;\n"
+        "}\n");
+    return;
+  }
+
+  // Four accumulators, each with its own index into gp.
+  SNPRINTF(line, CHAR_BUF_SIZE, "    %s%s temp0 = 0;\n", elemType, elemWidth);
+  shader_ += line;
+  SNPRINTF(line, CHAR_BUF_SIZE, "    %s%s temp1 = 0;\n", elemType, elemWidth);
+  shader_ += line;
+  SNPRINTF(line, CHAR_BUF_SIZE, "    %s%s temp2 = 0;\n", elemType, elemWidth);
+  shader_ += line;
+  SNPRINTF(line, CHAR_BUF_SIZE, "    %s%s temp3 = 0;\n", elemType, elemWidth);
+  shader_ += line;
+  shader_.append(
+      "    const unsigned int Max =  constBuf[0];\n"
+      "    unsigned int idx0 = (i % Max) +  constBuf[1];\n"
+      "    unsigned int idx1 = (i % Max) +  constBuf[2];\n"
+      "    unsigned int idx2 = (i % Max) +  constBuf[3];\n"
+      "    unsigned int idx3 = (i % Max) +  constBuf[4];\n");
+  for (unsigned int pass = numReads >> 2; pass > 0; pass--) {
+    shader_.append(
+        "    temp0 += *(gp + idx0);\n"
+        "    temp1 += *(gp + idx1);\n"
+        "    temp2 += *(gp + idx2);\n"
+        "    temp3 += *(gp + idx3);\n"
+        "    idx0 +=  constBuf[5];\n"
+        "    idx1 +=  constBuf[5];\n"
+        "    idx2 +=  constBuf[5];\n"
+        "    idx3 +=  constBuf[5];\n");
+  }
+  shader_.append(
+      "    *(outBuf + i) = temp0 + temp1 + temp2 + temp3;\n"
+      "}\n");
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Probes the OpenCL platform and device to size the sub-test matrix: the
+// extensions reported by device _deviceId decide which element types are
+// usable (NumTypes/StartType) and therefore _numSubTests. Any failure is
+// reported through CHECK_RESULT, which leaves the remaining setup undone.
+OCLPerfProgramGlobalRead::OCLPerfProgramGlobalRead() {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  context_ = 0;
+  skip_ = false;  // initialised first so early returns leave it defined
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ != CL_SUCCESS) {
+      delete[] platforms;  // the check below returns, so free the list first
+      CHECK_RESULT(true, "clGetPlatformIDs failed");
+    }
+    // Get last for default
+    platform = platforms[numPlatforms - 1];
+    for (unsigned i = 0; i < numPlatforms; ++i) {
+      num_devices = 0;
+      error_ =
+          _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so only the count is inspected; the first
+      // platform reporting matching devices wins.
+      if (num_devices > 0) {
+        platform = platforms[i];
+        break;
+      }
+    }
+    delete[] platforms;  // allocated with new[], so delete[] is required
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  CHECK_RESULT(num_devices == 0, "no devices");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Copy the handle out and free the list before a check can return early.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+  // Nothing below uses the context; release it now so the early return after
+  // the extension query cannot leak it.
+  error_ = _wrapper->clReleaseContext(context_);
+  context_ = 0;
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+
+  char charbuf[1024];
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS,
+                                     sizeof(charbuf), charbuf, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  const bool hasByteStore =
+      strstr(charbuf, "cl_khr_byte_addressable_store") != NULL;
+  const bool hasFp64 = strstr(charbuf, "cl_khr_fp64") != NULL;
+
+  // Without byte-addressable stores char and short are skipped; without fp64
+  // the trailing double type is dropped.
+  NumTypes = MaxTypes - (hasByteStore ? 0 : 2) - (hasFp64 ? 0 : 1);
+  StartType = hasByteStore ? 0 : 2;
+  _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES;
+}
+
+OCLPerfProgramGlobalRead::~OCLPerfProgramGlobalRead() {}  // no-op; see close()
+
+// Fills buffer (bufSize_ bytes) with val converted to the element type
+// selected by typeIdx_ (0 char, 1 short, 2 int, 3 long, 4 float, 5 double).
+// The buffer is mapped for writing with a blocking map on cmd_queue_ and
+// unmapped afterwards; failures are reported through CHECK_RESULT.
+void OCLPerfProgramGlobalRead::setData(cl_mem buffer, float val) {
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_WRITE, 0,
+                                   bufSize_, 0, NULL, NULL, &error_);
+  // A failed map yields no usable pointer; bail out instead of writing to it.
+  CHECK_RESULT(ptr == NULL, "clEnqueueMapBuffer failed");
+  switch (typeIdx_) {
+    case 0: {  // char
+      char *data = (char *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(char)); i++)
+        data[i] = (char)val;
+      break;
+    }
+    case 1: {  // short
+      short *data = (short *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(short)); i++)
+        data[i] = (short)val;
+      break;
+    }
+    case 2: {  // int
+      int *data = (int *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(int)); i++)
+        data[i] = (int)val;
+      break;
+    }
+    case 3: {  // long
+      cl_long *data = (cl_long *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_long)); i++)
+        data[i] = (cl_long)val;
+      break;
+    }
+    case 4: {  // float
+      float *data = (float *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(float)); i++)
+        data[i] = val;
+      break;
+    }
+    case 5: {  // double
+      double *data = (double *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(double)); i++)
+        data[i] = (double)val;
+      break;
+    }
+    default:
+      // Unknown type index: leave the buffer untouched.
+      break;
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
+}
+
+// Checks that every element of buffer (bufSize_ bytes, element type selected
+// by typeIdx_) equals numReads_. On the first mismatch it prints the index,
+// the expected value and up to four values starting at that index, and
+// reports the failure through CHECK_RESULT_NO_RETURN. The buffer is mapped
+// for reading with a blocking map on cmd_queue_ and unmapped before returning.
+void OCLPerfProgramGlobalRead::checkData(cl_mem buffer) {
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0,
+                                   bufSize_, 0, NULL, NULL, &error_);
+  // A failed map yields no usable pointer; bail out instead of reading it.
+  CHECK_RESULT(ptr == NULL, "clEnqueueMapBuffer failed");
+
+  unsigned int got[4] = {0, 0, 0, 0};  // values at the first mismatch
+  unsigned int shown = 0;              // valid entries in got[]
+  unsigned int badIndex = 0;           // index of the first mismatch
+  switch (typeIdx_) {
+    case 0:  // char
+    {
+      const char *data = (const char *)ptr;
+      const unsigned int count = bufSize_ / sizeof(char);
+      for (unsigned int i = 0; i < count && shown == 0; i++) {
+        if (data[i] == (char)numReads_) continue;
+        badIndex = i;
+        // Stop at the end of the buffer rather than reading past it.
+        for (unsigned int j = i; j < count && shown < 4; j++)
+          got[shown++] = (unsigned int)data[j];
+      }
+      break;
+    }
+    case 1:  // short
+    {
+      const short *data = (const short *)ptr;
+      const unsigned int count = bufSize_ / sizeof(short);
+      for (unsigned int i = 0; i < count && shown == 0; i++) {
+        if (data[i] == (short)numReads_) continue;
+        badIndex = i;
+        for (unsigned int j = i; j < count && shown < 4; j++)
+          got[shown++] = (unsigned int)data[j];
+      }
+      break;
+    }
+    case 2:  // int
+    {
+      const int *data = (const int *)ptr;
+      const unsigned int count = bufSize_ / sizeof(int);
+      for (unsigned int i = 0; i < count && shown == 0; i++) {
+        if (data[i] == (int)numReads_) continue;
+        badIndex = i;
+        for (unsigned int j = i; j < count && shown < 4; j++)
+          got[shown++] = (unsigned int)data[j];
+      }
+      break;
+    }
+    case 3:  // long
+    {
+      const cl_long *data = (const cl_long *)ptr;
+      const unsigned int count = bufSize_ / sizeof(cl_long);
+      for (unsigned int i = 0; i < count && shown == 0; i++) {
+        if (data[i] == (cl_long)numReads_) continue;
+        badIndex = i;
+        for (unsigned int j = i; j < count && shown < 4; j++)
+          got[shown++] = (unsigned int)data[j];
+      }
+      break;
+    }
+    case 4:  // float
+    {
+      const float *data = (const float *)ptr;
+      const unsigned int count = bufSize_ / sizeof(float);
+      for (unsigned int i = 0; i < count && shown == 0; i++) {
+        if (data[i] == (float)numReads_) continue;
+        badIndex = i;
+        for (unsigned int j = i; j < count && shown < 4; j++)
+          got[shown++] = (unsigned int)data[j];
+      }
+      break;
+    }
+    case 5:  // double
+    {
+      const double *data = (const double *)ptr;
+      const unsigned int count = bufSize_ / sizeof(double);
+      for (unsigned int i = 0; i < count && shown == 0; i++) {
+        if (data[i] == (double)numReads_) continue;
+        badIndex = i;
+        for (unsigned int j = i; j < count && shown < 4; j++)
+          got[shown++] = (unsigned int)data[j];
+      }
+      break;
+    }
+    default:
+      // Unknown type index: nothing to verify.
+      break;
+  }
+
+  if (shown > 0) {
+    printf("Data validation failed at index %d!\n", badIndex);
+    printf("Expected %d %d %d %d\nGot", numReads_, numReads_, numReads_,
+           numReads_);
+    for (unsigned int j = 0; j < shown; j++) printf(" %d", got[j]);
+    printf("\n");
+    // CHECK_RESULT_NO_RETURN fires only when its condition is true.
+    CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+  }
+
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
+}
+
+// Prepares one sub-test: decodes `test` into read count, buffer size, vector
+// width and element type, builds the generated _ReadSpeed program and fills
+// the kernel's buffers. Without OpenCL 2.0 headers the test is only marked
+// as skipped.
+void OCLPerfProgramGlobalRead::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  error_ = CL_SUCCESS;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  constBuffer_ = 0;
+
+#if defined(CL_VERSION_2_0)
+  numReads_ = NumReads[test % MAX_READ_MODES];
+  width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES];
+  vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths;
+  typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes +
+             StartType;
+  bufSize_ = width_;
+
+  cmd_queue_ = cmdQueues_[_deviceId];
+  cl_device_id device = devices_[_deviceId];
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_);
+  CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed");
+
+  genShader(typeIdx_, vecSizeIdx_, numReads_,
+            bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)));
+  const char *source = shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  if (typeIdx_ < 2) {
+    args += "-D USE_ARENA ";  // char and short kernels enable byte stores
+  }
+  args += "-cl-std=CL2.0";
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    log[0] = '\0';  // keeps the printf safe if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+    // CHECK_RESULT fires only when its condition is true.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_ReadSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &constBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(constBuffer) failed");
+
+  setData(outBuffer_, 1.2345678f);
+  unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(cBuf == NULL, "clEnqueueMapBuffer(constBuffer) failed");
+  // Force all wavefronts to fetch the same data.  We are looking for peak speed
+  // here.
+  cBuf[0] = 64;
+  // These values are chosen to assure there is no data reuse within a clause.
+  // If caching is not working, then the uncached numbers will be low.
+  cBuf[1] = 0;
+  cBuf[2] = 64;
+  cBuf[3] = 128;
+  cBuf[4] = 192;
+  cBuf[5] = 0;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
+  _wrapper->clFinish(cmd_queue_);
+#else
+  skip_ = true;
+  testDescString =
+      "Program scope globals not supported for < 2.0 builds. Test Skipped.";
+#endif
+}
+
+// Times NUM_ITER launches of the _ReadSpeed kernel and reports the
+// program-scope global read bandwidth in GB/s via _perfInfo, with the
+// configuration described in testDescString. No-op when skip_ is set.
+void OCLPerfProgramGlobalRead::run(void) {
+  if (skip_) return;
+#if defined(CL_VERSION_2_0)
+  // One work-item per (vector) element of the bufSize_-byte output buffer.
+  size_t global_work_size[1] = {
+      (size_t)(bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)))};
+  size_t local_work_size[1] = {64};
+
+  CPerfCounter timer;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              global_work_size,
+                                              local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed");
+  }
+  // If clFinish fails the launches did not complete, so the elapsed time
+  // would not describe real work; report the error instead of a number.
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clFinish failed");
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Program scope global read bandwidth in GB/s
+  double perf =
+      ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  char buf2[256];
+  SNPRINTF(buf, sizeof(buf), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]);
+  SNPRINTF(buf2, sizeof(buf2), " %-8s (%8d) %2d reads: (GB/s) ", buf, width_,
+           numReads_);
+  testDescString = buf2;
+  // Validation through checkData(outBuffer_) is currently disabled.
+#endif
+}
+
+unsigned int OCLPerfProgramGlobalRead::close(void) {
+#ifdef CL_VERSION_2_0
+  // Drain the queue first, then release the objects open() created.
+  if (cmd_queue_ != NULL) _wrapper->clFinish(cmd_queue_);
+  if (outBuffer_ != NULL) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (constBuffer_ != NULL) {
+    error_ = _wrapper->clReleaseMemObject(constBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(constBuffer_) failed");
+  }
+  if (kernel_ != NULL) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_ != NULL) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+#endif
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h
new file mode 100644
index 0000000000..bef3e25985
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PROGRAMGLOBALREAD_H
+#define _OCL_PROGRAMGLOBALREAD_H
+
+#include "OCLTestImp.h"
+
+class OCLPerfProgramGlobalRead : public OCLTestImp {  // bandwidth perf test
+ public:
+  OCLPerfProgramGlobalRead();
+  virtual ~OCLPerfProgramGlobalRead();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;
+  void genShader(unsigned int type, unsigned int vecWidth,
+                 unsigned int numReads, unsigned int bufSize);
+  void setData(cl_mem buffer, float data);
+  void checkData(cl_mem buffer);
+  // Iteration count that run() multiplies into the reported bandwidth.
+  static const unsigned int NUM_ITER = 100;
+  // OpenCL handles; close() releases all of them except cmd_queue_.
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  cl_mem constBuffer_;
+  // Values run() uses for the bandwidth figure and the result description.
+  unsigned int width_;
+  unsigned int bufSize_;
+  unsigned int vecSizeIdx_;
+  unsigned int numReads_;
+  unsigned int typeIdx_;
+
+  bool skip_;
+};
+
+#endif  // _OCL_PROGRAMGLOBALREAD_H
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp
new file mode 100644
index 0000000000..a26d4caa24
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp
@@ -0,0 +1,384 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfProgramGlobalWrite.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 4;
+static const unsigned int NUM_READ_MODES = 6;
+// Only the first MAX_READ_MODES entries of NumReads are selected (up to 32)
+static const unsigned int MAX_READ_MODES = 4;
+// Number of accesses per work-item that genShader() is asked to emit
+static const unsigned int NumReads[NUM_READ_MODES] = {1, 4, 16, 32, 64, 128};
+// 256KB, 1 MB, 4MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+static const unsigned int MaxTypes = 6;
+static unsigned int NumTypes = MaxTypes;  // narrowed by the constructor
+static const char *types[MaxTypes] = {"char", "short", "int",
+                                      "long", "float", "double"};
+static unsigned int StartType = 0;  // first usable index into types[]
+static const unsigned int NumVecWidths =
+    3;  // 5; char8 global scope does not work; bug opened
+static const char *vecWidths[NumVecWidths] = {"", "2", "4"};  //, "8", "16"};
+static const unsigned int vecWidths_int[NumVecWidths] = {1, 2, 4};  //, 8, 16};
+static const unsigned int TypeSize[MaxTypes] = {
+    sizeof(cl_char), sizeof(cl_short), sizeof(cl_int),
+    sizeof(cl_long), sizeof(cl_float), sizeof(cl_double)};
+#define CHAR_BUF_SIZE 512
+// Bounded formatting: sprintf_s when WIN_OS is defined, snprintf otherwise
+// (selected to quiet compiler warnings)
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfProgramGlobalWrite::genShader(unsigned int type,
+                                          unsigned int vecWidth,
+                                          unsigned int numReads,
+                                          unsigned int bufSize) {
+  // Rebuilds shader_: a program-scope array gp[bufSize] and two kernels.
+  char buf[CHAR_BUF_SIZE];
+  shader_.clear();
+  shader_ +=
+      "#ifdef USE_ARENA\n"
+      "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n"
+      "#endif\n";
+  shader_ +=
+      "#ifdef USE_AMD_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+      "#endif\n";
+  shader_ +=
+      "#ifdef USE_KHR_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      "#endif\n";
+  SNPRINTF(buf, CHAR_BUF_SIZE, "__global %s%s gp[%d];\n", types[type],
+           vecWidths[vecWidth], bufSize);
+  shader_.append(buf);
+  SNPRINTF(buf, CHAR_BUF_SIZE,
+           "__kernel void __attribute__((reqd_work_group_size(64,1,1))) "
+           "_WriteSpeed(constant uint * restrict constBuf)\n");
+  shader_.append(buf);
+  shader_ +=
+      "{\n"
+      "    uint i = (uint) get_global_id(0);\n";
+  if (numReads == 1) {  // one write to gp[i % Max] per work-item
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    shader_ += "    const unsigned int Max = constBuf[0];\n";
+    shader_ +=
+        "    *(gp + i % Max) = 0;\n"
+        "}\n";
+  } else {  // four interleaved write streams, offsets taken from constBuf
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp0 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp1 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp2 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp3 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    shader_ +=
+        "    const unsigned int Max = constBuf[0];\n"
+        "    unsigned int idx0 = (i % Max) + constBuf[1];\n"
+        "    unsigned int idx1 = (i % Max) + constBuf[2];\n"
+        "    unsigned int idx2 = (i % Max) + constBuf[3];\n"
+        "    unsigned int idx3 = (i % Max) + constBuf[4];\n";
+    // Each pass emits four writes, so numReads / 4 passes are generated.
+    for (unsigned int i = 0; i < (numReads >> 2); i++) {
+      shader_ += "    *(gp + idx0) = idx0;\n";
+      shader_ += "    *(gp + idx1) = idx1;\n";
+      shader_ += "    *(gp + idx2) = idx2;\n";
+      shader_ += "    *(gp + idx3) = idx3;\n";
+      shader_ += "    idx0 += constBuf[5];\n";
+      shader_ += "    idx1 += constBuf[5];\n";
+      shader_ += "    idx2 += constBuf[5];\n";
+      shader_ += "    idx3 += constBuf[5];\n";
+    }
+    shader_ += "}\n";
+  }
+  SNPRINTF(buf, CHAR_BUF_SIZE, "__kernel void __dummyRead(global %s%s *in)\n",
+           types[type], vecWidths[vecWidth]);
+  shader_.append(buf);  // second kernel: copies gp[i] into in[i]
+  shader_ +=
+      "{\n"
+      "    uint i = (uint) get_global_id(0);\n";
+  SNPRINTF(buf, CHAR_BUF_SIZE, "    in[i] = gp[i];\n");
+  shader_.append(buf);
+  shader_ += "}\n";
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty body
+
+// Picks a platform exposing devices of type_, reads the chosen device's
+// extension string and sizes _numSubTests from the supported data types.
+// NOTE(review): assumes CHECK_RESULT (defined elsewhere) returns on failure.
+OCLPerfProgramGlobalWrite::OCLPerfProgramGlobalWrite() {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  context_ = 0;
+  skip_ = false;  // set up front so an early exit leaves it initialized
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ != CL_SUCCESS) delete[] platforms;  // no leak on the bail-out
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // Get last for default
+    platform = platforms[numPlatforms - 1];
+    for (unsigned i = 0; i < numPlatforms; ++i) {
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
+      // The runtime returns an error (not 0 devices) when no matching device
+      // is present, so error_ is left unchecked; take the first platform hit.
+      if (num_devices > 0) {
+        platform = platforms[i];
+        break;
+      }
+    }
+    delete[] platforms;  // array form to match new[]
+  }
+
+  // Without a platform there is nothing to query, so give up here.
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device; copy the handle out and free the list right
+   * away so none of the checks below can leak it. */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char *p = strstr(charbuf, "cl_khr_byte_addressable_store");
+  char *p2 = strstr(charbuf, "cl_khr_fp64");
+
+  NumTypes = MaxTypes;
+  StartType = 0;  // reset with NumTypes: both are file-level statics
+  if (!p) {
+    // No arena ops: skip char and short
+    NumTypes -= 2;
+    StartType = 2;
+  }
+  if (!p2) {
+    // Doubles not supported
+    NumTypes--;
+  }
+  _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES;
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+}
+
+OCLPerfProgramGlobalWrite::~OCLPerfProgramGlobalWrite() {}  // see close()
+
+// Sets up sub-test `test`: decodes the write-count/size/vector-width/type
+// combination from the index, builds the generated program and fills the
+// constant buffer that _WriteSpeed reads its indexing parameters from.
+void OCLPerfProgramGlobalWrite::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+  error_ = CL_SUCCESS;
+  // Clear the handles before anything can bail out, so close() cannot see
+  // uninitialized values.
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  constBuffer_ = 0;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+#if defined(CL_VERSION_2_0)
+  numReads_ = NumReads[test % MAX_READ_MODES];
+  width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES];
+  vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths;
+  typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes +
+             StartType;
+  bufSize_ = width_;
+
+  cmd_queue_ = cmdQueues_[_deviceId];
+  cl_device_id device = devices_[_deviceId];
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_);
+  CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed");
+
+  genShader(typeIdx_, vecSizeIdx_, numReads_,
+            bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)));
+  const char *source = shader_.c_str();
+  program_ =
+      _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  if (typeIdx_ < 2) {
+    args += "-D USE_ARENA ";
+  }
+  args += "-cl-std=CL2.0";
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // stays empty if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+    // CHECK_RESULT fires when its condition is true, so pass the failed
+    // comparison rather than a literal 0.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_WriteSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void *)&constBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+
+  unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(cBuf == NULL, "clEnqueueMapBuffer failed");
+  // cBuf[0] is the modulus applied to the global id, so every work-group
+  // targets the same 64 elements.  We are looking for peak speed here.
+  cBuf[0] = 64;
+  // cBuf[1..4] are the base offsets of the four write streams and cBuf[5]
+  // is the stride added to each index after every pass.
+  cBuf[1] = 0;
+  cBuf[2] = 64;
+  cBuf[3] = 128;
+  cBuf[4] = 192;
+  cBuf[5] = 0;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+#else
+  skip_ = true;
+  testDescString =
+      "Program scope globals not supported for < 2.0 builds. Test Skipped.";
+#endif
+}
+
+// Times NUM_ITER launches of the kernel built by open() and reports the
+// resulting write bandwidth through _perfInfo and testDescString.
+void OCLPerfProgramGlobalWrite::run(void) {
+  if (skip_) return;
+#if defined(CL_VERSION_2_0)
+  // One work-item per element of the program-scope array, 64 per group.
+  const int itemCount = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_));
+  const int groupSize = 64;
+  const size_t globalSize[1] = {(size_t)itemCount};
+  const size_t localSize[1] = {(size_t)groupSize};
+
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  unsigned int launch = 0;
+  while (launch < NUM_ITER) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              globalSize, localSize, 0, NULL,
+                                              NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    ++launch;
+  }
+  // Block until every launch has completed before stopping the clock.
+  _wrapper->clFinish(cmd_queue_);
+  stopwatch.Stop();
+  const double elapsed = stopwatch.GetElapsedTime();
+
+  // Program scope global write bandwidth in GB/s
+  const double bytesMoved = (double)bufSize_ * numReads_ * NUM_ITER;
+  const double gbPerSec = (bytesMoved * (double)(1e-09)) / elapsed;
+  _perfInfo = (float)gbPerSec;
+  char typeName[256];
+  char description[256];
+  SNPRINTF(typeName, sizeof(typeName), "%s%s", types[typeIdx_],
+           vecWidths[vecSizeIdx_]);
+  SNPRINTF(description, sizeof(description), " %-8s (%8d) %2d reads: (GB/s) ",
+           typeName, width_, numReads_);
+  testDescString = description;
+#endif
+}
+
+unsigned int OCLPerfProgramGlobalWrite::close(void) {
+#if defined(CL_VERSION_2_0)
+  // Teardown: drain the queue, then release every handle that is non-null.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (constBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(constBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(constBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+#endif
+  return OCLTestImp::close();  // the base-class result is returned as-is
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h
new file mode 100644
index 0000000000..6102bb7428
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PROGRAMGLOBALWRITE_H_
+#define _OCL_PROGRAMGLOBALWRITE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfProgramGlobalWrite : public OCLTestImp {  // bandwidth perf test
+ public:
+  OCLPerfProgramGlobalWrite();
+  virtual ~OCLPerfProgramGlobalWrite();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // OpenCL C source produced by genShader() and compiled in open().
+  std::string shader_;
+  void genShader(unsigned int type, unsigned int vecWidth,
+                 unsigned int numReads, unsigned int bufSize);
+  // Number of kernel launches timed by one run().
+  static const unsigned int NUM_ITER = 100;
+  // cmd_queue_ is borrowed from cmdQueues_; close() releases the others.
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  cl_mem constBuffer_;
+  // Sub-test parameters decoded from the test index in open().
+  unsigned int width_;
+  unsigned int bufSize_;
+  unsigned int vecSizeIdx_;
+  unsigned int numReads_;
+  unsigned int typeIdx_;
+  // True when built without CL_VERSION_2_0; run() then returns at once.
+  bool skip_;
+};
+
+#endif  // _OCL_PROGRAMGLOBALWRITE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp
new file mode 100644
index 0000000000..9cea4518d8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp
@@ -0,0 +1,841 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSHA256.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const char *sha256_kernel =
+    "typedef uint UINT;\n"
+    "\n"
+    "#define VECTOR_LEN 1\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "\n"
+    "inline UINT byteswap(UINT x)\n"
+    "{\n"
+    "	UINT res = 0;\n"
+    "	\n"
+    "	for (uint i=0; i<4; i++)\n"
+    "	{\n"
+    "		res <<= 8;\n"
+    "		res |= (x & 0xff);\n"
+    "		x >>= 8;\n"
+    "	}\n"
+    "	\n"
+    "	return res;\n"
+    "}\n"
+    "\n"
+    "#else\n"
+    "\n"
+    "inline UINT byteswap(const UINT x)\n"
+    "{\n"
+    "	return x;\n"
+    "}\n"
+    "\n"
+    "#endif\n"
+    "\n"
+    "\n"
+    "void sha256_step( const UINT data[16], UINT *state )\n"
+    "{\n"
+    "   UINT W[64], temp1, temp2;\n"
+    "   UINT A, B, C, D, E, F, G, H;\n"
+    "\n"
+    "   for( int i = 0; i < 16; i++)\n"
+    "   {\n"
+    "      W[i] = byteswap(data[i]);\n"
+    "   }\n"
+    "\n"
+    "#define SHR(x,n)  ((x & 0xFFFFFFFF) >> n)\n"
+    "#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))\n"
+    "\n"
+    "#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))\n"
+    "#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))\n"
+    "\n"
+    "#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))\n"
+    "#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))\n"
+    "\n"
+    "#define F0(x,y,z) ((x & y) | (z & (x | y)))\n"
+    "#define F1(x,y,z) (z ^ (x & (y ^ z)))\n"
+    "\n"
+    "#define R(t)                                    \\\n"
+    "(                                               \\\n"
+    "    W[t] = S1(W[t -  2]) + W[t -  7] +          \\\n"
+    "           S0(W[t - 15]) + W[t - 16]            \\\n"
+    ")\n"
+    "\n"
+    "#define P(a,b,c,d,e,f,g,h,x,K)                  \\\n"
+    "{                                               \\\n"
+    "    temp1 = h + S3(e) + F1(e,f,g) + K + x;      \\\n"
+    "    temp2 = S2(a) + F0(a,b,c);                  \\\n"
+    "    d += temp1; h = temp1 + temp2;              \\\n"
+    "}\n"
+    "\n"
+    "    A = state[0];\n"
+    "    B = state[1];\n"
+    "    C = state[2];\n"
+    "    D = state[3];\n"
+    "    E = state[4];\n"
+    "    F = state[5];\n"
+    "    G = state[6];\n"
+    "    H = state[7];\n"
+    "\n"
+    "    P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 );\n"
+    "    P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 );\n"
+    "    P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF );\n"
+    "    P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 );\n"
+    "    P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B );\n"
+    "    P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 );\n"
+    "    P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 );\n"
+    "    P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 );\n"
+    "    P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 );\n"
+    "    P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 );\n"
+    "    P( G, H, A, B, C, D, E, F, W[10], 0x243185BE );\n"
+    "    P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 );\n"
+    "    P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 );\n"
+    "    P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE );\n"
+    "    P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 );\n"
+    "    P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(16), 0xE49B69C1 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC );\n"
+    "    P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F );\n"
+    "    P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA );\n"
+    "    P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC );\n"
+    "    P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA );\n"
+    "    P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D );\n"
+    "    P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(27), 0xBF597FC7 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 );\n"
+    "    P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(31), 0x14292967 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC );\n"
+    "    P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB );\n"
+    "    P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E );\n"
+    "    P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B );\n"
+    "    P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 );\n"
+    "    P( C, D, E, F, G, H, A, B, R(46), 0xF40E3585 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(50), 0x2748774C );\n"
+    "    P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A );\n"
+    "    P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F );\n"
+    "    P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(56), 0x748F82EE );\n"
+    "    P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F );\n"
+    "    P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA );\n"
+    "    P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB );\n"
+    "    P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 );\n"
+    "\n"
+    "    state[0] += A;\n"
+    "    state[1] += B;\n"
+    "    state[2] += C;\n"
+    "    state[3] += D;\n"
+    "    state[4] += E;\n"
+    "    state[5] += F;\n"
+    "    state[6] += G;\n"
+    "    state[7] += H;\n"
+    "}\n"
+    "\n"
+    "\n"
+    "#define choose_temp(x) ((x)/16)\n"
+    "\n"
+    "#define STORE_TO_TEMP(i) tb[((i)/16)][((i)%16)]\n"
+    "\n"
+    "\n"
+    "__kernel void CryptThread(__global const uint *buffer, __global uint "
+    "*state, const uint blockLen, const uint foo)\n"
+    "{\n"
+    "	const uint init[8] = {\n"
+    "		0x6a09e667,\n"
+    "		0xbb67ae85,\n"
+    "		0x3c6ef372,\n"
+    "		0xa54ff53a,\n"
+    "		0x510e527f,\n"
+    "		0x9b05688c,\n"
+    "		0x1f83d9ab,\n"
+    "		0x5be0cd19\n"
+    "	};\n"
+    "	\n"
+    "	const uint id = get_global_id(0);\n"
+    "	uint len = blockLen;\n"
+    "	uint i, j;\n"
+    "	const uint startPosInDWORDs = (len*id*foo)/4;\n"
+    "	const uint msgLenInBitsl = len * 8;\n"
+    "	const uint msgLenInBitsh = (len) >> (32-3);\n"
+    "	UINT localState[8];\n"
+    "\n"
+    "	for (j=0; j<8; j++) {\n"
+    "		localState[j] = init[j];\n"
+    "	}\n"
+    "\n"
+    "	i = 0;\n"
+    "	while (len >=64)\n"
+    "	{\n"
+    "		UINT data[16];\n"
+    "		for (j=0; j<16; j++) {\n"
+    "			data[j] = buffer[j + startPosInDWORDs + i];\n"
+    "		}\n"
+    "\n"
+    "		sha256_step(data, localState);\n"
+    "		i += 16;\n"
+    "		len -= 64;\n"
+    "	}\n"
+    "\n"
+    "	len /= 4;\n"
+    "\n"
+    "	UINT tb[2][16];\n"
+    "\n"
+    "	for (j=0; j<len; j++) \n"
+    "	{\n"
+    "		STORE_TO_TEMP(j) = buffer[j + startPosInDWORDs + i];\n"
+    "	}\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "	STORE_TO_TEMP(len) = 0x80;\n"
+    "#else\n"
+    "	STORE_TO_TEMP(len) = byteswap(0x80000000);\n"
+    "#endif\n"
+    "\n"
+    "	i = len+1;\n"
+    "\n"
+    "	while ((i % (512/32)) != (448/32))\n"
+    "	{\n"
+    "		STORE_TO_TEMP(i) = 0;\n"
+    "		i++;\n"
+    "	}\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "	{\n"
+    "		STORE_TO_TEMP(i) = byteswap(msgLenInBitsh);\n"
+    "		STORE_TO_TEMP(i + 1) = byteswap(msgLenInBitsl);\n"
+    "		i += 2;\n"
+    "	}\n"
+    "\n"
+    "#else\n"
+    "#endif\n"
+    "	\n"
+    "	sha256_step(tb[0], localState);\n"
+    "	if (32 == i)\n"
+    "	{\n"
+    "		sha256_step(tb[1], localState);\n"
+    "	}\n"
+    "	\n"
+    "	for (j=0; j<8; j++)\n"
+    "	{\n"
+    "		state[id*8 + j] = localState[j];\n"
+    "	}\n"
+    "}\n";
+
+static const char *sha256_opt_kernel =
+    "typedef uint UINT;\n"
+    "\n"
+    "#define VECTOR_LEN 1\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "\n"
+    "inline UINT byteswap(UINT x)\n"
+    "{\n"
+    "	UINT res = 0;\n"
+    "	\n"
+    "	for (uint i=0; i<4; i++)\n"
+    "	{\n"
+    "		res <<= 8;\n"
+    "		res |= (x & 0xff);\n"
+    "		x >>= 8;\n"
+    "	}\n"
+    "	\n"
+    "	return res;\n"
+    "}\n"
+    "\n"
+    "#else\n"
+    "\n"
+    "inline UINT byteswap(const UINT x)\n"
+    "{\n"
+    "	return x;\n"
+    "}\n"
+    "\n"
+    "#endif\n"
+    "\n"
+    "\n"
+    "void sha256_step( const UINT data[16], UINT *state )\n"
+    "{\n"
+    "   UINT W[64], temp1, temp2;\n"
+    "   UINT A, B, C, D, E, F, G, H;\n"
+    "\n"
+    "   for( int i = 0; i < 16; i++)\n"
+    "   {\n"
+    "      W[i] = byteswap(data[i]);\n"
+    "   }\n"
+    "\n"
+    "#define SHR(x,n)  ((x & 0xFFFFFFFF) >> n)\n"
+    "#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))\n"
+    "\n"
+    "#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))\n"
+    "#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))\n"
+    "\n"
+    "#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))\n"
+    "#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))\n"
+    "\n"
+    "#define F0(x,y,z) ((x & y) | (z & (x | y)))\n"
+    "#define F1(x,y,z) (z ^ (x & (y ^ z)))\n"
+    "\n"
+    "#define R(t)                                    \\\n"
+    "(                                               \\\n"
+    "    W[t] = S1(W[t -  2]) + W[t -  7] +          \\\n"
+    "           S0(W[t - 15]) + W[t - 16]            \\\n"
+    ")\n"
+    "\n"
+    "#define P(a,b,c,d,e,f,g,h,x,K)                  \\\n"
+    "{                                               \\\n"
+    "    temp1 = h + S3(e) + F1(e,f,g) + K + x;      \\\n"
+    "    temp2 = S2(a) + F0(a,b,c);                  \\\n"
+    "    d += temp1; h = temp1 + temp2;              \\\n"
+    "}\n"
+    "\n"
+    "    A = state[0];\n"
+    "    B = state[1];\n"
+    "    C = state[2];\n"
+    "    D = state[3];\n"
+    "    E = state[4];\n"
+    "    F = state[5];\n"
+    "    G = state[6];\n"
+    "    H = state[7];\n"
+    "\n"
+    "    P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 );\n"
+    "    P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 );\n"
+    "    P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF );\n"
+    "    P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 );\n"
+    "    P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B );\n"
+    "    P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 );\n"
+    "    P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 );\n"
+    "    P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 );\n"
+    "    P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 );\n"
+    "    P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 );\n"
+    "    P( G, H, A, B, C, D, E, F, W[10], 0x243185BE );\n"
+    "    P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 );\n"
+    "    P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 );\n"
+    "    P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE );\n"
+    "    P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 );\n"
+    "    P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(16), 0xE49B69C1 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC );\n"
+    "    P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F );\n"
+    "    P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA );\n"
+    "    P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC );\n"
+    "    P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA );\n"
+    "    P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D );\n"
+    "    P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(27), 0xBF597FC7 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 );\n"
+    "    P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(31), 0x14292967 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC );\n"
+    "    P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB );\n"
+    "    P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E );\n"
+    "    P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B );\n"
+    "    P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 );\n"
+    "    P( C, D, E, F, G, H, A, B, R(46), 0xF40E3585 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(50), 0x2748774C );\n"
+    "    P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A );\n"
+    "    P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F );\n"
+    "    P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(56), 0x748F82EE );\n"
+    "    P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F );\n"
+    "    P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA );\n"
+    "    P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB );\n"
+    "    P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 );\n"
+    "\n"
+    "    state[0] += A;\n"
+    "    state[1] += B;\n"
+    "    state[2] += C;\n"
+    "    state[3] += D;\n"
+    "    state[4] += E;\n"
+    "    state[5] += F;\n"
+    "    state[6] += G;\n"
+    "    state[7] += H;\n"
+    "}\n"
+    "\n"
+    "\n"
+    "#define choose_temp(x) ((x)/16)\n"
+    "\n"
+    "#define STORE_TO_TEMP(i) tb[((i)/16)][((i)%16)]\n"
+    "\n"
+    "#define WAVEFRONT_SIZE 64\n"
+    "\n"
+    "__kernel void CryptThread(__global const uint *buffer, __global uint "
+    "*state, const uint blockLen, const uint foo)\n"
+    "{\n"
+    "	const uint init[8] = {\n"
+    "		0x6a09e667,\n"
+    "		0xbb67ae85,\n"
+    "		0x3c6ef372,\n"
+    "		0xa54ff53a,\n"
+    "		0x510e527f,\n"
+    "		0x9b05688c,\n"
+    "		0x1f83d9ab,\n"
+    "		0x5be0cd19\n"
+    "	};\n"
+    "	\n"
+    "	const uint id = get_global_id(0);\n"
+    "	const uint lid = get_local_id(0);\n"
+    "	uint len = blockLen;\n"
+    "	uint i, j;\n"
+    "	const uint startPosInDWORDs = (len*id*foo)/4;\n"
+    "uint blockStartInDWORDs = (len*(id / WAVEFRONT_SIZE)*WAVEFRONT_SIZE)/4;\n"
+    "	const uint msgLenInBitsl = len * 8;\n"
+    "	const uint msgLenInBitsh = (len) >> (32-3);\n"
+    "	UINT localState[8];\n"
+    "\n"
+    "	for (j=0; j<8; j++) {\n"
+    "		localState[j] = init[j];\n"
+    "	}\n"
+    "\n"
+    "	i = 0;\n"
+    "	while (len >=64)\n"
+    "	{\n"
+    "		UINT data[16];\n"
+    "		for (j=0; j<16; j++) {\n"
+    "			//data[j] = buffer[j + startPosInDWORDs + i];\n"
+    "			data[j] = buffer[j*WAVEFRONT_SIZE + blockStartInDWORDs "
+    "+ i*WAVEFRONT_SIZE + lid];\n"
+    "		}\n"
+    "\n"
+    "		sha256_step(data, localState);\n"
+    "		i += 16;\n"
+    "		len -= 64;\n"
+    "	}\n"
+    "\n"
+    "	len /= 4;\n"
+    "\n"
+    "	UINT tb[2][16];\n"
+    "\n"
+    "	for (j=0; j<len; j++) \n"
+    "	{\n"
+    "		//STORE_TO_TEMP(j) = buffer[j + startPosInDWORDs + i];\n"
+    "			STORE_TO_TEMP(j) = buffer[j*WAVEFRONT_SIZE + "
+    "blockStartInDWORDs + i*WAVEFRONT_SIZE + lid];\n"
+    "	}\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "	STORE_TO_TEMP(len) = 0x80;\n"
+    "#else\n"
+    "	STORE_TO_TEMP(len) = byteswap(0x80000000);\n"
+    "#endif\n"
+    "\n"
+    "	i = len+1;\n"
+    "\n"
+    "	while ((i % (512/32)) != (448/32))\n"
+    "	{\n"
+    "		STORE_TO_TEMP(i) = 0;\n"
+    "		i++;\n"
+    "	}\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "	{\n"
+    "		STORE_TO_TEMP(i) = byteswap(msgLenInBitsh);\n"
+    "		STORE_TO_TEMP(i + 1) = byteswap(msgLenInBitsl);\n"
+    "		i += 2;\n"
+    "	}\n"
+    "\n"
+    "#else\n"
+    "#endif\n"
+    "	\n"
+    "	sha256_step(tb[0], localState);\n"
+    "	if (32 == i)\n"
+    "	{\n"
+    "		sha256_step(tb[1], localState);\n"
+    "	}\n"
+    "	\n"
+    "	for (j=0; j<8; j++)\n"
+    "	{\n"
+    "		state[id*8 + j] = localState[j];\n"
+    "	}\n"
+    "}\n";
+
+#define NUM_BUF_TYPES 3  // buffer-count configurations (see switch in open())
+#define NUM_KERNELS 2    // kernel sources: sha256_kernel, sha256_opt_kernel
+// One sub-test for every (kernel source, buffer configuration) pair.
+OCLPerfSHA256::OCLPerfSHA256() { _numSubTests = NUM_KERNELS * NUM_BUF_TYPES; }
+
+OCLPerfSHA256::~OCLPerfSHA256() {}
+
+// Fills `buffer` with `val`; true only if map, fill and unmap all succeeded.
+bool OCLPerfSHA256::setData(cl_mem buffer, unsigned int val) {
+  // Blocking write-map of the first bufSize_ bytes; status lands in error_.
+  unsigned int *mapped = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  if (error_ != CL_SUCCESS) {
+    printf("\nError code : %d\n", error_);
+    return false;
+  }
+  // Store `val` into each of the width_ elements, then release the mapping.
+  for (unsigned int idx = 0; idx < width_; ++idx) mapped[idx] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, mapped, 0,
+                                             NULL, NULL);
+  return error_ == CL_SUCCESS;
+}
+
+void OCLPerfSHA256::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // Nothing is verified yet; on a failed map keep error_ and skip the unmap.
+  if (error_ != CL_SUCCESS || data == NULL) return;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+// Context callback handed to clCreateContext() in open(); it does nothing.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Sets up sub-test `test` on device `deviceId` of platform _platformIndex:
+// context, queue, filled input/output buffers, the built sha256_kernel or
+// sha256_opt_kernel program and its arguments. Sets `conversion` to 1.0.
+void OCLPerfSHA256::open(unsigned int test, char *units, double &conversion,
+                         unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  num_input_buf_ = 1;
+  num_output_buf_ = 1;
+  blockSize_ = 1024;
+  isAMD = false;
+
+  width_ = 22347776;
+  // Each buffer holds width_ 32-bit words
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (_platformIndex < numPlatforms) {  // also rejects an out-of-range index
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    char pbuf[100] = "";  // stays empty if the vendor query fails
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose platform with GPU devices
+    if (num_devices > 0) {
+      if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+    delete[] platforms;
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device, then free the temporary list straight away so
+   * that neither of the two checks below can leak it. */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  switch (_openTest % NUM_BUF_TYPES) {
+    case 0:
+      num_input_buf_ = 1;
+      num_output_buf_ = 1;
+      break;
+
+    case 1:
+      num_input_buf_ = 1;
+      num_output_buf_ = 4;
+      break;
+
+    case 2:
+      num_input_buf_ = 4;
+      num_output_buf_ = 4;
+      break;
+  };
+
+  inBuffer_ = new cl_mem[num_input_buf_]();  // zeroed: close() releases all
+  outBuffer_ = new cl_mem[num_output_buf_]();
+
+  for (int i = 0; i < num_input_buf_; ++i) {
+    inBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(inBuffer_[i] == 0, "clCreateBuffer(inBuffer) failed");
+    bool result = setData(inBuffer_[i], 0xdeadbeef);
+    CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed");
+  }
+
+  for (int i = 0; i < num_output_buf_; ++i) {
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+    bool result = setData(outBuffer_[i], 0xdeadbeef);
+    CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed");
+  }
+
+  if (_openTest >= NUM_BUF_TYPES) {
+    program_ = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&sha256_opt_kernel, NULL, &error_);
+    CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+  } else {
+    program_ = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&sha256_kernel, NULL, &error_);
+    CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+  }
+
+  const char *buildOps = NULL;
+  if (isAMD) {
+    // Enable caching
+    buildOps = "-fno-alias";
+  }
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    // Pre-zeroed so a failed log query still prints a terminated string.
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // error_ still holds the clBuildProgram status, so this always reports.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void *)&inBuffer_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(0) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem),
+                                    (void *)&outBuffer_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(1) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&blockSize_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(2) failed");
+  // Foo is not part of the original test, this can be used to see how much of
+  // the performance is limited by fetch. Set foo to 0 and all threads will
+  // fetch the same 1k block.  This way they will all be in cache and hit max
+  // fetch speed.
+  unsigned int foo = 1;
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), (void *)&foo);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(3) failed");
+}
+
+// Runs 10 untimed warm-up launches, then times MAX_ITERATIONS launches and
+// stores the GB/s figure in _perfInfo and a description in testDescString.
+void OCLPerfSHA256::run(void) {
+  int global = bufSize_ / blockSize_;
+  // 32 was noted to give the best result due to memory thrashing, but 64 is
+  // what is used here.  Need to optimize and give feedback to SiSoft.
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  // Warm-up
+  for (unsigned int i = 0; i < 10; i++) {
+    if (num_input_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                        (void *)&inBuffer_[i % num_input_buf_]);
+    }
+    if (num_output_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(
+          kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_[i % num_output_buf_]);
+    }
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) break;  // keep the first failure visible
+  }
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < MAX_ITERATIONS; i++) {
+    if (num_input_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                        (void *)&inBuffer_[i % num_input_buf_]);
+    }
+    if (num_output_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(
+          kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_[i % num_output_buf_]);
+    }
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) break;  // keep the first failure visible
+  }
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // No idea what data should be in here
+  // checkData(outBuffer_);
+  // Compute GB/s
+  double perf =
+      ((double)bufSize_ * (double)MAX_ITERATIONS * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  testDescString = (_openTest >= NUM_BUF_TYPES) ? "opt " : "def ";
+
+  testDescString += "with ";
+  char str[40];
+  // Bounded write: the label can never overrun str.
+  snprintf(str, sizeof(str), "%2d ip buff and %2d op buff ", num_input_buf_,
+           num_output_buf_);
+  testDescString += str;
+}
+
+// Releases every OpenCL object that open() created and returns _crcword.
+unsigned int OCLPerfSHA256::close(void) {
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);  // 0 if open() stopped early
+  if (inBuffer_) {
+    for (int i = 0; i < num_input_buf_; ++i) {
+      error_ = _wrapper->clReleaseMemObject(inBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(inBuffer_) failed");
+    }
+    delete[] inBuffer_;
+  }
+  if (outBuffer_) {
+    for (int i = 0; i < num_output_buf_; ++i) {
+      error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(outBuffer_) failed");
+    }
+    delete[] outBuffer_;
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h
new file mode 100644
index 0000000000..60d62efbe5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_SHA256_H_
+#define _OCL_SHA256_H_
+
+#include "OCLTestImp.h"
+
+// Perf test that times launches of the CryptThread SHA-256 kernel (GB/s).
+class OCLPerfSHA256 : public OCLTestImp {
+ public:
+  OCLPerfSHA256();
+  virtual ~OCLPerfSHA256();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;  // NOTE(review): no use seen in the .cpp code reviewed
+  bool setData(cl_mem buffer, unsigned int data);  // fill buffer with `data`
+  void checkData(cl_mem buffer);  // maps/unmaps only; verifies nothing
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem* inBuffer_;       // array of num_input_buf_ buffers
+  cl_mem* outBuffer_;      // array of num_output_buf_ buffers
+  cl_int num_input_buf_;
+  cl_int num_output_buf_;
+  cl_int error_;           // status of the latest OpenCL call
+
+  unsigned int width_;      // cl_uint elements per buffer
+  unsigned int bufSize_;    // bytes per buffer
+  unsigned int blockSize_;  // passed to the kernel as blockLen
+  static const unsigned int MAX_ITERATIONS = 100;  // timed launches in run()
+  bool isAMD;  // platform vendor is "Advanced Micro Devices, Inc."
+};
+
+#endif  // _OCL_SHA256_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp
new file mode 100644
index 0000000000..15746d163b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp
@@ -0,0 +1,263 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMAlloc.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 5
+#define NUM_CG_FLAGS 3
+#define NUM_FG_FLAGS 3
+// Sizes in cl_int4 elements; run() multiplies by sizeof(cl_int4) for bytes.
+static size_t sizeList[NUM_SIZES] = {
+    0x040000, 0x080000, 0x100000, 0x200000, 0x400000,
+};
+
+#if defined(CL_VERSION_2_0)
+static const cl_svm_mem_flags CGFlags[NUM_CG_FLAGS] = {  // by testCGFlag_
+    CL_MEM_READ_WRITE,
+    CL_MEM_WRITE_ONLY,
+    CL_MEM_READ_ONLY,
+};
+static const cl_svm_mem_flags FGFlags[NUM_FG_FLAGS] = {  // by testFGFlag_
+    0,  // coarse grain: no extra SVM flag
+    CL_MEM_SVM_FINE_GRAIN_BUFFER,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+};
+#endif
+// Kernel source whose only store is guarded by (int)get_local_id(0) < 0.
+static const char *strKernel =
+    "__kernel void dummy(__global uint* out)    \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   uint value = 1;                         \n"
+    "   if ((int)get_local_id(0) < 0)           \n"
+    "       out[id] = value;                    \n"
+    "}                                          \n";
+
+OCLPerfSVMAlloc::OCLPerfSVMAlloc() {
+  // Every (FG flag, CG flag, size) combination, then one extra test per size.
+  _numSubTests = NUM_SIZES * (NUM_CG_FLAGS * NUM_FG_FLAGS + 1);
+  failed_ = skip_ = false;
+}
+
+OCLPerfSVMAlloc::~OCLPerfSVMAlloc() {}
+
+// Decodes `test`, marks unsupported SVM modes as skipped, and builds `dummy`.
+void OCLPerfSVMAlloc::open(unsigned int test, char *units, double &conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+#if defined(CL_VERSION_2_0)
+  FGSystem_ = (test >= (NUM_CG_FLAGS * NUM_FG_FLAGS * NUM_SIZES));
+  testFGFlag_ = (test / (NUM_SIZES * NUM_CG_FLAGS)) % NUM_FG_FLAGS;
+  testCGFlag_ = (test / NUM_SIZES) % NUM_CG_FLAGS;
+  testSize_ = test % NUM_SIZES;
+
+  cl_device_svm_capabilities caps = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_SVM_CAPABILITIES, sizeof(caps),
+                                     &caps, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if ((caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) == 0) {
+    skip_ = true;  // Should never happen as OCL 2.0 devices are required to
+                   // support coarse grain SVM
+    testDescString = "Coarse Grain Buffer  NOT supported. Test Skipped.";
+    return;
+  } else if (testFGFlag_ > 0 && (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0) {
+    skip_ = true;  // No support for fine grain buffer SVM
+    testDescString = "Fine Grain Buffer NOT supported. Test Skipped.";
+    return;
+  } else if (FGSystem_ && (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) {
+    skip_ = true;  // No support for fine grain system SVM
+    testDescString = "Fine Grain System NOT supported. Test Skipped.";
+    return;
+  } else if (testFGFlag_ == 2 && (caps & CL_DEVICE_SVM_ATOMICS) == 0) {
+    skip_ = true;  // No support for SVM atomics
+    testDescString = "SVM Atomic        NOT supported. Test Skipped.";
+    return;
+  }
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = "";  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+#endif
+}
+// Empty OpenCL notification callback; nothing in this file references it.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Times alloc+launch+free against launch-only and reports the difference.
+void OCLPerfSVMAlloc::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  if (failed_) {
+    return;
+  }
+#if defined(CL_VERSION_2_0)
+  cl_uint *buffer = NULL;
+  CPerfCounter timer;
+
+  size_t bufSize = sizeList[testSize_] * sizeof(cl_int4);
+  size_t iter = 100;
+
+  cl_mem_flags flags = CGFlags[testCGFlag_] | FGFlags[testFGFlag_];
+
+  timer.Reset();
+  timer.Start();
+
+  size_t gws[1] = {bufSize / sizeof(cl_int4)};
+  size_t lws[1] = {64};
+
+  for (size_t i = 0; i < iter; ++i) {
+    if (!FGSystem_) {
+      buffer = (cl_uint *)clSVMAlloc(context_, flags, bufSize, 0);
+    } else {
+      buffer = (cl_uint *)malloc(bufSize);
+    }
+    CHECK_RESULT(buffer == 0, "Allocation failed");
+
+    error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, 0, buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+    if (!FGSystem_) {
+      clSVMFree(context_, (void *)buffer);
+    } else {
+      free(buffer);
+    }
+  }
+
+  timer.Stop();
+
+  CPerfCounter timer2;
+  timer2.Reset();
+  size_t numN = 100;
+
+  if (!FGSystem_) {
+    buffer = (cl_uint *)clSVMAlloc(context_, flags, bufSize, 0);
+  } else {
+    buffer = (cl_uint *)malloc(bufSize);
+  }
+  CHECK_RESULT(buffer == 0, "Allocation failed");
+
+  timer2.Start();
+  for (size_t i = 0; i < numN; ++i) {
+    error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, 0, buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer2.Stop();
+
+  if (!FGSystem_) {
+    clSVMFree(context_, (void *)buffer);
+  } else {
+    free(buffer);
+  }
+
+  char pFlags[5] = {0};  // zero-filled so %4s below finds a terminator
+  pFlags[0] =
+      (testCGFlag_ == 0 || testCGFlag_ == 2) ? 'R' : '_';  // CL_MEM_READ_ONLY
+  pFlags[1] =
+      (testCGFlag_ == 0 || testCGFlag_ == 1) ? 'W' : '_';  // CL_MEM_WRITE_ONLY
+  pFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2)
+                  ? 'F'
+                  : '_';                       // CL_MEM_SVM_FINE_GRAIN_BUFFER
+  pFlags[3] = (testFGFlag_ == 2) ? 'A' : '_';  // CL_MEM_SVM_ATOMICS
+
+  char buf[256];
+
+  if (!FGSystem_ && (testFGFlag_ == 0)) {
+    SNPRINTF(buf, sizeof(buf),
+             "Coarse Grain Buffer Alloc + Free (GB/s) for %6d KB, flags=%4s",
+             (int)bufSize / 1024, pFlags);
+  } else if (!FGSystem_ && (testFGFlag_ > 0)) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain Buffer   Alloc + Free (GB/s) for %6d KB, flags=%4s",
+             (int)bufSize / 1024, pFlags);
+  } else if (FGSystem_) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain System   Alloc + Free (GB/s) for %6d KB, flags=N/A ",
+             (int)bufSize / 1024);
+  }
+
+  testDescString = buf;
+  double sec1 = timer.GetElapsedTime();
+  double sec2 = timer2.GetElapsedTime();
+  _perfInfo = static_cast<float>((bufSize * (double)(1e-09)) /
+                                 (sec1 / iter - sec2 / numN));
+#endif
+}
+// Delegates entirely to OCLTestImp::close() and returns its result.
+unsigned int OCLPerfSVMAlloc::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h
new file mode 100644
index 0000000000..4a4818a3c6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_SVM_ALLOC_H_
+#define _OCL_PERF_SVM_ALLOC_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSVMAlloc : public OCLTestImp {
+ public:
+  OCLPerfSVMAlloc();
+  virtual ~OCLPerfSVMAlloc();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;
+  unsigned int testSize_;
+  bool FGSystem_;            // true: run() releases the buffer with free()
+  unsigned int testCGFlag_;  // selects the 'R'/'W' characters of the label
+  unsigned int testFGFlag_;  // 1 or 2 -> 'F' in the label; 2 also -> 'A'
+  bool skip_;
+};
+
+#endif  // _OCL_PERF_SVM_ALLOC_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp
new file mode 100644
index 0000000000..5c9be9f3e4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp
@@ -0,0 +1,255 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMKernelArguments.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+static const size_t BufSize = 0x1000;      // cl_int elements per SVM buffer
+static const size_t Iterations = 0x10000;  // run() divides this by queues and buffers
+static const size_t TotalQueues = 4;       // modulus for run()'s Queues table index
+static const size_t TotalBufs = 4;         // length of open()'s NumBuffs table
+static const size_t TotalArgs = 4;         // length of Arguments
+
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+static const char *Arguments[TotalArgs] = {  // parameter lists with 1, 5, 10 and 20 pointers
+    "__global uint* out",
+    "__global uint* out, __global uint* buf0, __global uint* buf1, __global "
+    "uint* buf2, __global uint* buf3",
+    "__global uint* out, __global uint* buf0, __global uint* buf1, __global "
+    "uint* buf2, __global uint* buf3, \n"
+    "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global "
+    "uint* buf7, __global uint* buf8",
+    "__global uint* out, __global uint* buf0, __global uint* buf1, __global "
+    "uint* buf2, __global uint* buf3,\n"
+    "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global "
+    "uint* buf7, __global uint* buf8,\n"
+    "__global uint* buf9, __global uint* buf10, __global uint* buf11, __global "
+    "uint* buf12, __global uint* buf13,\n"
+    "__global uint* buf14, __global uint* buf15, __global uint* buf16, "
+    "__global uint* buf17, __global uint* buf18"};
+
+static const char *strKernel =  // format template; open() substitutes an Arguments entry
+    "__kernel void dummy(%s)                    \n"
+    "{                                          \n"
+    "   uint id = get_global_id(0);             \n"
+    "   uint value = 1;                         \n"
+    "   out[id] = value;                        \n"
+    "}                                          \n";
+
+OCLPerfSVMKernelArguments::OCLPerfSVMKernelArguments()
+    : failed_(false), test_(0), skip_(false), inOutBuffer(NULL), numBufs_(0) {
+  // Zeroed buffer bookkeeping keeps close() safe when open() returns early.
+  _numSubTests = TotalQueues * TotalArgs;  // * TotalBufs;
+}
+
+OCLPerfSVMKernelArguments::~OCLPerfSVMKernelArguments() {}
+
+void OCLPerfSVMKernelArguments::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+#if defined(CL_VERSION_2_0)
+  // Reset first: close() walks these even when open() returns early below.
+  numBufs_ = 0;
+  inOutBuffer = NULL;
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  // caps starts at 0 so a failed query skips the test instead of testing junk.
+  cl_device_svm_capabilities caps = 0;
+  error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES,
+                           sizeof(cl_device_svm_capabilities), &caps, NULL);
+  if ((error_ != CL_SUCCESS) || !(caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) {
+    skip_ = true;
+    testDescString = "SVM NOT supported. Test Skipped.";
+    return;
+  }
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  // The source lives on the stack so the early returns below cannot leak it.
+  char program[4096];
+  const char *source = program;
+  SNPRINTF(program, sizeof(program), strKernel,
+           Arguments[(test_ / TotalQueues) % TotalArgs]);
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = "";  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  static const size_t NumBuffs[TotalBufs] = {0x20, 0x100, 0x800, 0x2000};
+  const size_t bufSize = BufSize * sizeof(cl_int);
+  const size_t n = NumBuffs[(test_ / (TotalQueues * TotalArgs)) % TotalBufs];
+  // calloc() NULL-fills the table; clSVMFree() ignores NULL entries in close().
+  inOutBuffer = (void **)calloc(n, sizeof(void *));
+  CHECK_RESULT((inOutBuffer == NULL), "calloc() failed");
+  numBufs_ = (unsigned int)n;
+  for (size_t b = 0; b < numBufs_; ++b) {
+    inOutBuffer[b] = clSVMAlloc(context_, CL_MEM_READ_WRITE, bufSize, 0);
+    CHECK_RESULT((inOutBuffer[b] == NULL), "clSVMAlloc() failed");
+  }
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty; not referenced elsewhere in this file
+
+void OCLPerfSVMKernelArguments::run(void) {
+  // Measures the host-side cost of binding SVM pointers to every kernel
+  // argument and enqueueing the kernel; _perfInfo receives the elapsed time
+  // scaled by 1e6 and divided by the number of timed dispatches.
+  if (skip_ || failed_) {
+    return;
+  }
+
+#if defined(CL_VERSION_2_0)
+  CPerfCounter timer;
+  static const size_t Queues[] = {1, 2, 4, 8};
+  size_t numQueues = Queues[test_ % TotalQueues];
+  cl_uint numArguments = 0;
+  error_ = _wrapper->clGetKernelInfo(kernel_, CL_KERNEL_NUM_ARGS,
+                                     sizeof(cl_uint), &numArguments, NULL);
+  CHECK_RESULT(((error_ != CL_SUCCESS) || (numArguments == 0)),
+               "clGetKernelInfo() failed");
+  size_t iter = Iterations / numQueues / numBufs_;
+  iter = (iter == 0) ? 1 : iter;
+
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  for (size_t q = 0; q < numQueues; ++q) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[_deviceId], 0, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues[q] = cmdQueue;
+  }
+  // Warm-up
+  for (size_t b = 0; b < (numBufs_ / numArguments); ++b) {
+    for (size_t q = 0; q < numQueues; ++q) {
+      for (cl_uint a = 0; a < numArguments; ++a) {
+        void *buffer = inOutBuffer[(b * numArguments + a) % numBufs_];
+        error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, a, buffer);
+        CHECK_RESULT((error_ != CL_SUCCESS),
+                     "clSetKernelArgSVMPointer() failed");
+      }
+
+      size_t gws[1] = {256};
+      size_t lws[1] = {256};
+      error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                                gws, lws, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    }
+  }
+  for (size_t q = 0; q < numQueues; ++q) {
+    _wrapper->clFinish(cmdQueues[q]);
+  }
+
+  size_t disp = 0;
+  timer.Reset();
+  timer.Start();
+
+  for (size_t i = 0; i < iter; ++i) {
+    for (size_t b = 0; b < numBufs_; ++b) {
+      for (size_t q = 0; q < numQueues; ++q) {
+        for (cl_uint a = 0; a < numArguments; ++a) {
+          void *buffer = inOutBuffer[(b * numArguments + a) % numBufs_];
+          error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, a, buffer);
+          CHECK_RESULT((error_ != CL_SUCCESS),
+                       "clSetKernelArgSVMPointer() failed");
+        }
+
+        size_t gws[1] = {256};
+        size_t lws[1] = {256};
+        error_ = _wrapper->clEnqueueNDRangeKernel(
+            cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, NULL);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+        disp++;
+      }
+    }
+  }
+  for (size_t q = 0; q < numQueues; ++q) {
+    _wrapper->clFinish(cmdQueues[q]);
+  }
+  timer.Stop();
+
+  for (size_t q = 0; q < numQueues; ++q) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  std::stringstream stream;
+  stream << "Setup time (us) for " << numQueues << " queues, ";
+  stream.flags(std::ios::right | std::ios::showbase);
+  stream.width(2);
+  stream << numArguments;
+  stream << " arguments, ";
+  stream.flags(std::ios::right | std::ios::showbase);
+  stream.width(4);
+  stream << numBufs_ << " buffers";
+  testDescString = stream.str();
+  _perfInfo = static_cast<float>(timer.GetElapsedTime() * 1000000 / disp);
+#endif
+}
+
+unsigned int OCLPerfSVMKernelArguments::close(void) {
+#if defined(CL_VERSION_2_0)
+  for (size_t b = 0; b < numBufs_; ++b) {  // clSVMFree() returns void
+    _wrapper->clSVMFree(context_, inOutBuffer[b]);
+  }
+  if (numBufs_ != 0) { free(inOutBuffer); inOutBuffer = NULL; numBufs_ = 0; }
+#endif
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h
new file mode 100644
index 0000000000..4b08fde849
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_SVM_KERNEL_ARGUMENTS_H_
+#define _OCL_PERF_SVM_KERNEL_ARGUMENTS_H_
+
+#include <vector>
+
+#include "OCLTestImp.h"
+
+class OCLPerfSVMKernelArguments : public OCLTestImp {
+ public:
+  OCLPerfSVMKernelArguments();
+  virtual ~OCLPerfSVMKernelArguments();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;           // set by open() when the device is not a GPU
+  unsigned int test_;     // subtest index stored by open()
+  bool skip_;             // set by open() when SVM is not available
+  void** inOutBuffer;     // malloc'd table of clSVMAlloc() pointers
+  unsigned int numBufs_;  // number of entries in inOutBuffer
+};
+
+#endif  // _OCL_PERF_SVM_KERNEL_ARGUMENTS_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp
new file mode 100644
index 0000000000..e0a7aef3c3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp
@@ -0,0 +1,153 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMMap.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 5
+static size_t sizeList[] = {  // element counts; run() multiplies by sizeof(cl_int4)
+    0x040000, 0x080000, 0x100000, 0x200000, 0x400000,
+};
+
+#define NUM_FLAGS 4
+static const cl_map_flags Flags[NUM_FLAGS] = {CL_MAP_READ, CL_MAP_WRITE,  // indexed by testFlag_
+                                              CL_MAP_READ | CL_MAP_WRITE,
+                                              CL_MAP_WRITE_INVALIDATE_REGION};
+
+OCLPerfSVMMap::OCLPerfSVMMap() : failed_(false), skip_(false) {
+  // One subtest for every combination of a sizeList entry and a Flags entry
+  // (NUM_SIZES * NUM_FLAGS in total).
+  _numSubTests = NUM_SIZES * NUM_FLAGS;
+}
+
+OCLPerfSVMMap::~OCLPerfSVMMap() {}
+
+void OCLPerfSVMMap::open(unsigned int test, char *units, double &conversion,
+                         unsigned int deviceId) {
+#if defined(CL_VERSION_2_0)
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  // Subtest index layout: test = flag index * NUM_SIZES + size index.
+  testFlag_ = test / NUM_SIZES;
+  testSize_ = test % NUM_SIZES;
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  cl_device_svm_capabilities caps = 0;
+  error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES,
+                           sizeof(cl_device_svm_capabilities), &caps, NULL);
+  // Skip when the query fails or coarse-grain buffer SVM is not reported.
+  if ((error_ != CL_SUCCESS) || !(caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) {
+    skip_ = true;
+    testDescString = "SVM NOT supported. Test Skipped.";
+    return;
+  }
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty; not referenced elsewhere in this file
+
+void OCLPerfSVMMap::run(void) {
+  // Times `iter` rounds of clEnqueueSVMMap + clEnqueueSVMUnmap + clFinish on
+  // one SVM buffer and stores bufSize * iter * 1e-9 / elapsed time in
+  // _perfInfo; the description string labels that figure as GB/s.
+  if (skip_ || failed_) {
+    return;
+  }
+
+#if defined(CL_VERSION_2_0)
+  void *buffer;
+  CPerfCounter timer;
+
+  const size_t bufSize = sizeList[testSize_] * sizeof(cl_int4);
+  const cl_map_flags flag = Flags[testFlag_];
+  const size_t iter = 100;
+
+  timer.Reset();
+
+  // clSVMAlloc() reports failure by returning NULL; it never sets error_.
+  buffer = clSVMAlloc(context_, CL_MEM_READ_WRITE, bufSize, 0);
+  CHECK_RESULT((buffer == NULL), "clSVMAlloc() failed");
+
+  for (size_t i = 0; i < iter; ++i) {
+    timer.Start();
+
+    error_ = clEnqueueSVMMap(cmdQueues_[_deviceId], CL_FALSE, flag, buffer,
+                             bufSize, 0, 0, 0);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMMap() failed");
+
+    error_ = clEnqueueSVMUnmap(cmdQueues_[_deviceId], buffer, 0, 0, 0);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMUnmap() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+    timer.Stop();
+  }
+
+  clSVMFree(context_, (void *)buffer);
+
+  char pFlags[4];
+  pFlags[0] = (testFlag_ == 0 || testFlag_ == 2) ? 'R' : '_';  // CL_MAP_READ
+  pFlags[1] = (testFlag_ == 1 || testFlag_ == 2) ? 'W' : '_';  // CL_MAP_WRITE
+  pFlags[2] = (testFlag_ == 3) ? 'I' : '_';  // CL_MAP_WRITE_INVALIDATE_REGION
+  pFlags[3] = '\0';  // %3s below needs a terminated string
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "Map + Unmap (GB/s) for %6d KB, flags=%3s",
+           (int)bufSize / 1024, pFlags);
+
+  testDescString = buf;
+  double sec = timer.GetElapsedTime();
+  _perfInfo = static_cast<float>((bufSize * iter * (double)(1e-09)) / sec);
+#endif
+}
+
+unsigned int OCLPerfSVMMap::close(void) { return OCLTestImp::close(); }  // only forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h
new file mode 100644
index 0000000000..eedc6b7d2a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_SVM_MAP_H_
+#define _OCL_PERF_SVM_MAP_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSVMMap : public OCLTestImp {
+ public:
+  OCLPerfSVMMap();
+  virtual ~OCLPerfSVMMap();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;            // set by open() when the device is not a GPU
+  unsigned int testSize_;  // index into sizeList (test % NUM_SIZES)
+  unsigned int testFlag_;  // index into Flags (test / NUM_SIZES)
+  bool skip_;              // set by open() when SVM is not available
+};
+
+#endif  // _OCL_PERF_SVM_MAP_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp
new file mode 100644
index 0000000000..24c45a6b2a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp
@@ -0,0 +1,214 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMMemFill.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_MODES 3
+#define NUM_CG_FLAGS 2
+#define NUM_FG_FLAGS 3
+
+static size_t typeSizeList[] = {
+    1,  // sizeof(cl_uchar)
+    2,   4, 8, 16, 32, 64,
+    128,  // sizeof(cl_ulong16)
+};
+
+static unsigned int eleNumList[] = {  // element counts; run() sizes the buffer at 4 bytes each
+    0x0020000, 0x0080000, 0x0200000, 0x0800000, 0x2000000,
+};
+
+#if defined(CL_VERSION_2_0)
+static const cl_svm_mem_flags CGFlags[NUM_CG_FLAGS] = {
+    CL_MEM_READ_WRITE,
+    CL_MEM_WRITE_ONLY,
+};
+static const cl_svm_mem_flags FGFlags[NUM_FG_FLAGS] = {
+    0,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+};
+#endif
+
+OCLPerfSVMMemFill::OCLPerfSVMMemFill() {
+  failed_ = skip_ = false;
+  num_elements_ = sizeof(eleNumList) / sizeof(eleNumList[0]);
+  num_typeSize_ = sizeof(typeSizeList) / sizeof(typeSizeList[0]);
+  // All CG x FG flag pairs plus one extra pass for the fine-grain system mode.
+  _numSubTests =
+      num_elements_ * num_typeSize_ * (NUM_FG_FLAGS * NUM_CG_FLAGS + 1);
+}
+
+OCLPerfSVMMemFill::~OCLPerfSVMMemFill() {}
+
+void OCLPerfSVMMemFill::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  _deviceId = deviceId;  // run() indexes cmdQueues_ with it, as sibling tests do
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+#if defined(CL_VERSION_2_0)
+  // Decode the flat subtest index; the element count varies fastest.
+  FGSystem_ =
+      (test >= (num_elements_ * num_typeSize_ * NUM_FG_FLAGS * NUM_CG_FLAGS));
+  testFGFlag_ =
+      (test / (num_elements_ * num_typeSize_ * NUM_CG_FLAGS)) % NUM_FG_FLAGS;
+  testCGFlag_ = (test / (num_elements_ * num_typeSize_)) % NUM_CG_FLAGS;
+  testTypeSize_ = typeSizeList[(test / num_elements_) % num_typeSize_];
+  testNumEle_ = eleNumList[test % num_elements_];
+
+  cl_device_svm_capabilities caps;
+  error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES,
+                           sizeof(cl_device_svm_capabilities), &caps, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if ((caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) == 0) {
+    skip_ = true;  // Should never happen as OCL 2.0 devices are required to
+                   // support coarse grain SVM
+    testDescString = "Coarse Grain Buffer  NOT supported. Test Skipped.";
+    return;
+  } else if (testFGFlag_ > 0 && (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0) {
+    skip_ = true;  // No support for fine grain buffer SVM
+    testDescString = "Fine Grain Buffer NOT supported. Test Skipped.";
+    return;
+  } else if (FGSystem_ && (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) {
+    skip_ = true;  // No support for fine grain system SVM
+    testDescString = "Fine Grain System NOT supported. Test Skipped.";
+    return;
+  } else if (testFGFlag_ == 2 && ((caps & CL_DEVICE_SVM_ATOMICS) == 0)) {
+    skip_ = true;  // No support for SVM Atomic
+    testDescString = "SVM Atomic        NOT supported. Test Skipped.";
+    return;
+  }
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty; not referenced elsewhere in this file
+
+void OCLPerfSVMMemFill::run(void) {
+  // Times 100 clEnqueueSVMMemFill() calls plus the closing clFinish() on one
+  // buffer (clSVMAlloc, or malloc in fine-grain system mode) and stores
+  // bufSize * iter * 1e-9 / elapsed time in _perfInfo.
+  if (skip_ || failed_) {
+    return;
+  }
+
+#if defined(CL_VERSION_2_0)
+  cl_uint *buffer = NULL;
+  CPerfCounter timer;
+  size_t iter = 100, bufSize = testNumEle_ * 4;
+
+  cl_mem_flags flags = CGFlags[testCGFlag_] | FGFlags[testFGFlag_];
+  // Zeroed stack pattern sized for the largest type (cl_ulong16): no leak.
+  unsigned char data[sizeof(cl_ulong16)] = {0};
+
+  timer.Reset();
+
+  if (!FGSystem_) {
+    buffer =
+        (cl_uint *)clSVMAlloc(context_, flags, bufSize, (cl_uint)testTypeSize_);
+    CHECK_RESULT(buffer == 0, "Allocation failed");
+  } else {  // FGSystem_ = true
+    buffer = (cl_uint *)malloc(bufSize);
+    CHECK_RESULT(buffer == 0, "Allocation failed");
+  }
+
+  timer.Start();
+  for (size_t i = 0; i < iter; ++i) {
+    error_ = clEnqueueSVMMemFill(cmdQueues_[_deviceId], buffer, data,
+                                 testTypeSize_, bufSize, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMMemFill() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  if (!FGSystem_) {
+    clSVMFree(context_, (void *)buffer);
+  } else {
+    free(buffer);
+  }
+
+  char pFlags[5];
+  pFlags[0] =
+      (testCGFlag_ == 0 || testCGFlag_ == 2) ? 'R' : '_';  // CL_MEM_READ_ONLY
+  pFlags[1] =
+      (testCGFlag_ == 0 || testCGFlag_ == 1) ? 'W' : '_';  // CL_MEM_WRITE_ONLY
+  pFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2)
+                  ? 'F'
+                  : '_';                       // CL_MEM_SVM_FINE_GRAIN_BUFFER
+  pFlags[3] = (testFGFlag_ == 2) ? 'A' : '_';  // CL_MEM_SVM_ATOMICS
+  pFlags[4] = '\0';  // %4s below needs a terminated string
+  char buf[256];
+
+  if (!FGSystem_ && (testFGFlag_ == 0)) {
+    SNPRINTF(buf, sizeof(buf),
+             "Coarse Grain Buffer SVMMemFill (GB/s) for %6d KB, typeSize:%3d, "
+             "flags=%4s",
+             (int)bufSize / 1024, (int)testTypeSize_, pFlags);
+  } else if (!FGSystem_ && (testFGFlag_ > 0)) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain Buffer   SVMMemFill (GB/s) for %6d KB, typeSize:%3d, "
+             "flags=%4s",
+             (int)bufSize / 1024, (int)testTypeSize_, pFlags);
+  } else if (FGSystem_) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain System   SVMMemFill (GB/s) for %6d KB, typeSize:%3d, "
+             "flags=%4s",
+             (int)bufSize / 1024, (int)testTypeSize_, pFlags);
+  }
+
+  testDescString = buf;
+  double sec = timer.GetElapsedTime();
+  _perfInfo = static_cast<float>((bufSize * iter * (double)(1e-09)) / sec);
+#endif
+}
+
+unsigned int OCLPerfSVMMemFill::close(void) { return OCLTestImp::close(); }  // only forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h
new file mode 100644
index 0000000000..37ceed8d49
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_SVM_MEMFILL_H_
+#define _OCL_PERF_SVM_MEMFILL_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSVMMemFill : public OCLTestImp {
+ public:
+  OCLPerfSVMMemFill();
+  virtual ~OCLPerfSVMMemFill();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  unsigned int num_typeSize_;  // entry count of typeSizeList
+  unsigned int num_elements_;  // entry count of eleNumList
+  bool FGSystem_;              // true: run() fills a malloc() buffer, not clSVMAlloc()
+  size_t testTypeSize_;        // pattern size in bytes given to clEnqueueSVMMemFill
+  unsigned int testCGFlag_;    // index into CGFlags
+  unsigned int testFGFlag_;    // index into FGFlags
+  unsigned int testNumEle_;    // element count; run() allocates 4 bytes per element
+  bool atomic_;                // not referenced in OCLPerfSVMMemFill.cpp
+  bool failed_;                // set by open() when the device is not a GPU
+  bool skip_;                  // set by open() when the needed SVM support is missing
+};
+
+#endif  // _OCL_PERF_SVM_MEMFILL_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp
new file mode 100644
index 0000000000..33f0c05146
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp
@@ -0,0 +1,216 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMMemcpy.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 5  // entries in sizeList
+#define NUM_SRC_FLAGS 2  // entries in srcFlagList
+#define NUM_DST_FLAGS 2  // entries in dstFlagList
+#define NUM_FG_FLAGS 3  // entries in FGFlags
+
+static size_t sizeList[NUM_SIZES] = {  // element counts; run() multiplies by sizeof(cl_int4)
+    0x040000, 0x080000, 0x100000, 0x200000, 0x400000,
+};
+
+#if defined(CL_VERSION_2_0)  // the flag tables only exist when CL_VERSION_2_0 is defined
+static const cl_svm_mem_flags srcFlagList[NUM_SRC_FLAGS] = {CL_MEM_READ_WRITE,
+                                                            CL_MEM_READ_ONLY};  // indexed by testSrcFlag_
+static const cl_svm_mem_flags dstFlagList[NUM_DST_FLAGS] = {CL_MEM_READ_WRITE,
+                                                            CL_MEM_WRITE_ONLY};  // indexed by testDstFlag_
+static const cl_svm_mem_flags FGFlags[NUM_FG_FLAGS] = {  // OR-ed into both src and dst flags in run()
+    0,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+};
+#endif
+
+// Subtests: every src/dst/FG flag combination plus one fine-grain-system set, per size.
+OCLPerfSVMMemcpy::OCLPerfSVMMemcpy() : failed_(false), skip_(false) {
+  const unsigned int flagCombos = NUM_SRC_FLAGS * NUM_DST_FLAGS * NUM_FG_FLAGS;
+  _numSubTests = (flagCombos + 1) * NUM_SIZES;
+}
+
+OCLPerfSVMMemcpy::~OCLPerfSVMMemcpy() {}  // nothing to release: run() frees its own buffers
+
+// Decodes the subtest index, then flags the subtest skipped/failed if it cannot run.
+void OCLPerfSVMMemcpy::open(unsigned int test, char *units, double &conversion,
+                            unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+#if defined(CL_VERSION_2_0)
+  // Index layout, fastest-varying first: size, dst flag, src flag, FG flag.
+  const unsigned int perSrc = NUM_SIZES * NUM_DST_FLAGS;
+  const unsigned int perFG = perSrc * NUM_SRC_FLAGS;
+  testSize_ = test % NUM_SIZES;
+  testDstFlag_ = (test / NUM_SIZES) % (NUM_DST_FLAGS);
+  testSrcFlag_ = (test / perSrc) % (NUM_SRC_FLAGS);
+  testFGFlag_ = (test / perFG) % (NUM_FG_FLAGS);
+  FGSystem_ = (test >= perFG * NUM_FG_FLAGS);  // past all buffer-flag combinations
+
+  cl_device_svm_capabilities caps;
+  error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES,
+                           sizeof(cl_device_svm_capabilities), &caps, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (!(caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) {
+    skip_ = true;  // unexpected: OCL 2.0 devices must support coarse grain SVM
+    testDescString = "Coarse Grain Buffer  NOT supported. Test Skipped.";
+    return;
+  }
+  if ((testFGFlag_ > 0) && !(caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
+    skip_ = true;  // fine grain buffer SVM unavailable
+    testDescString = "Fine Grain Buffer NOT supported. Test Skipped.";
+    return;
+  }
+  if (FGSystem_ && !(caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) {
+    skip_ = true;  // fine grain system SVM unavailable
+    testDescString = "Fine Grain System NOT supported. Test Skipped.";
+    return;
+  }
+  if ((testFGFlag_ == 2) && !(caps & CL_DEVICE_SVM_ATOMICS)) {
+    skip_ = true;  // SVM atomics unavailable
+    testDescString = "SVM Atomic        NOT supported. Test Skipped.";
+    return;
+  }
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if ((deviceType & CL_DEVICE_TYPE_GPU) == 0) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+  }
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // intentionally empty; nothing in this file references it
+
+// Times 100 queued clEnqueueSVMMemcpy calls between two bufSize-byte buffers and
+// stores the bandwidth (GB/s) in _perfInfo and a label in testDescString.
+void OCLPerfSVMMemcpy::run(void) {
+  if (skip_ || failed_) {
+    return;
+  }
+#if defined(CL_VERSION_2_0)
+  cl_uint *src = NULL, *dst = NULL;
+  CPerfCounter timer;
+
+  size_t bufSize = sizeList[testSize_] * sizeof(cl_int4);
+  size_t iter = 100;
+  cl_mem_flags srcFlags = srcFlagList[testSrcFlag_] | FGFlags[testFGFlag_];
+  cl_mem_flags dstFlags = dstFlagList[testDstFlag_] | FGFlags[testFGFlag_];
+
+  if (!FGSystem_) {
+    src = (cl_uint *)clSVMAlloc(context_, srcFlags, bufSize, 0);
+    CHECK_RESULT(src == 0, "Allocation failed");
+    dst = (cl_uint *)clSVMAlloc(context_, dstFlags, bufSize, 0);
+    if (dst == 0) clSVMFree(context_, (void *)src);  // don't leak src when bailing out
+    CHECK_RESULT(dst == 0, "Allocation failed");
+  } else {  // FGSystem_ == true
+    src = (cl_uint *)malloc(bufSize);
+    dst = (cl_uint *)malloc(bufSize);
+    const bool hostAllocFailed = (src == NULL) || (dst == NULL);
+    if (hostAllocFailed) {  // previously unchecked: NULL went straight to the copy
+      free(src);
+      free(dst);
+    }
+    CHECK_RESULT(hostAllocFailed, "Allocation failed");
+  }
+
+  timer.Reset();
+  timer.Start();
+  for (size_t i = 0; i < iter; ++i) {
+    clEnqueueSVMMemcpy(cmdQueues_[_deviceId], false, dst, src, bufSize, 0, NULL,
+                       NULL);
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  if (!FGSystem_) {
+    clSVMFree(context_, (void *)src);
+    clSVMFree(context_, (void *)dst);
+  } else {  // FGSystem_ = true
+    free(src);
+    free(dst);
+  }
+
+  char pSrcFlags[5];
+  pSrcFlags[0] =
+      (testSrcFlag_ == 0 || testSrcFlag_ == 1) ? 'R' : '_';  // CL_MEM_READ_ONLY
+  pSrcFlags[1] = (testSrcFlag_ == 0) ? 'W' : '_';  // CL_MEM_WRITE_ONLY
+  pSrcFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2)
+                     ? 'F'
+                     : '_';  // CL_MEM_SVM_FINE_GRAIN_BUFFER
+  pSrcFlags[3] = (testFGFlag_ == 2) ? 'A' : '_';  // CL_MEM_SVM_ATOMICS
+  pSrcFlags[4] = '\0';
+
+  char pDstFlags[5];
+  pDstFlags[0] = (testDstFlag_ == 0) ? 'R' : '_';
+  pDstFlags[1] = (testDstFlag_ == 0 || testDstFlag_ == 1) ? 'W' : '_';
+  pDstFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2) ? 'F' : '_';
+  pDstFlags[3] = (testFGFlag_ == 2) ? 'A' : '_';
+  pDstFlags[4] = '\0';  // was pSrcFlags[4], which left pDstFlags unterminated for %4s
+
+  char buf[256];
+  if (FGSystem_) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain System   SVMMemcpy (GB/s) for %6d KB, from:%4s to:%4s",
+             (int)bufSize / 1024, pSrcFlags, pDstFlags);
+  } else if (testFGFlag_ == 0) {
+    SNPRINTF(buf, sizeof(buf),
+             "Coarse Grain Buffer SVMMemcpy (GB/s) for %6d KB, from:%4s to:%4s",
+             (int)bufSize / 1024, pSrcFlags, pDstFlags);
+  } else {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain Buffer   SVMMemcpy (GB/s) for %6d KB, from:%4s to:%4s",
+             (int)bufSize / 1024, pSrcFlags, pDstFlags);
+  }
+
+  testDescString = buf;
+  double sec = timer.GetElapsedTime();
+  _perfInfo = static_cast<float>((bufSize * iter * (double)(1e-09)) / sec);
+#endif
+}
+
+unsigned int OCLPerfSVMMemcpy::close(void) { return OCLTestImp::close(); }  // no test-local cleanup; forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h
new file mode 100644
index 0000000000..32fe4fb49e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_SVM_MEMCPY_H_
+#define _OCL_PERF_SVM_MEMCPY_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSVMMemcpy : public OCLTestImp {  // times clEnqueueSVMMemcpy between two buffers
+ public:
+  OCLPerfSVMMemcpy();
+  virtual ~OCLPerfSVMMemcpy();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // decodes `test` into the selectors below
+  virtual void run(void);  // performs the timed copies and fills in the result
+  virtual unsigned int close(void);  // forwards to OCLTestImp::close()
+
+ private:
+  bool failed_;  // set by open() when the device is not a GPU; run() then returns early
+  unsigned int testSize_;  // index into sizeList
+  unsigned int testSrcFlag_;  // index into srcFlagList
+  unsigned int testDstFlag_;  // index into dstFlagList
+  unsigned int testFGFlag_;  // index into FGFlags: 0 none, 1 fine grain, 2 fine grain + atomics
+  bool FGSystem_;  // true: run() uses malloc'd buffers instead of clSVMAlloc
+  bool skip_;  // set by open() when a required SVM capability is missing
+};
+
+#endif  // _OCL_PERF_SVM_MEMCPY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp
new file mode 100644
index 0000000000..f13e4cc410
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp
@@ -0,0 +1,359 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMSampleRate.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_TYPES 3  // entries in types / typeSizes
+static const char *types[NUM_TYPES] = {"float", "float2", "float4"};  // passed to the build as -D DATATYPE=<name>
+static const unsigned int typeSizes[NUM_TYPES] = {4, 8, 16};  // bytes per element, parallel to types
+
+#define NUM_SIZES 12  // entries in sizes
+static const unsigned int sizes[NUM_SIZES] = {1,  2,   4,   8,   16,   32,
+                                              64, 128, 256, 512, 1024, 2048};  // edge lengths of the square domain
+
+#define NUM_BUFS 6  // buffer-count steps: open() uses 1 << step input buffers
+#define MAX_BUFS (1 << (NUM_BUFS - 1))  // largest input-buffer count (32)
+
+#define NUM_READS numBufs_  // expands to the member, so it is only usable inside the class's methods
+
+// Every type x size x buffer-count combination, repeated for the 3 SVM modes.
+OCLPerfSVMSampleRate::OCLPerfSVMSampleRate() : skip_(false) {
+  _numSubTests = NUM_TYPES * NUM_SIZES * NUM_BUFS * 3;
+}
+
+OCLPerfSVMSampleRate::~OCLPerfSVMSampleRate() {}  // empty: buffers, kernel and program are released in close()
+
+// Assembles the OpenCL C source of the "sampleRate" kernel in shader_: one
+// global DATATYPE input pointer per buffer, each read and summed into tmp.
+void OCLPerfSVMSampleRate::setKernel(void) {
+  char line[256];
+  shader_ =
+      "kernel void sampleRate(global DATATYPE* outBuffer, unsigned int "
+      "inBufSize, unsigned int writeIt,\n";
+
+  // Parameter list: every input buffer but the last is followed by a comma.
+  const unsigned int lastBuf = numBufs_ - 1;
+  for (unsigned int b = 0; b < numBufs_; ++b) {
+    SNPRINTF(line, sizeof(line), "global DATATYPE* inBuffer%d", b);
+    shader_ += line;
+    shader_ += (b < lastBuf) ? ",\n" : "\n";
+  }
+  shader_ +=
+      ")\n"
+      "{\n"
+      "    uint gid = get_global_id(0);\n"
+      "    uint inputIdx = gid % inBufSize;\n"
+      "    DATATYPE tmp = (DATATYPE)0.0f;\n";
+
+  // Each pass reads every input buffer once.
+  const unsigned int passes = NUM_READS / numBufs_;
+  for (unsigned int p = 0; p < passes; ++p) {
+    for (unsigned int b = 0; b < numBufs_; ++b) {
+      SNPRINTF(line, sizeof(line), "    tmp += inBuffer%d[inputIdx];\n", b);
+      shader_ += line;
+    }
+    // writeIt is 0, so we don't need to add a modulo
+    shader_ += "    inputIdx += writeIt;\n";
+  }
+
+  // Vector types test the .x component; the 4-byte scalar type tests tmp itself.
+  const bool isVector = typeSizes[typeIdx_] > 4;
+  shader_ += isVector
+                 ? "    if (writeIt*(unsigned int)tmp.x) outBuffer[gid] = tmp;\n"
+                 : "    if (writeIt*(unsigned int)tmp) outBuffer[gid] = tmp;\n";
+  shader_ += "}\n";
+}
+
+void OCLPerfSVMSampleRate::setData(void *buffer, unsigned int val) {  // fills bufSize_ bytes with val
+#if defined(CL_VERSION_2_0)
+  error_ = _wrapper->clEnqueueSVMMemFill(cmd_queue_, buffer, &val, sizeof(val),
+                                         bufSize_, 0, NULL, NULL);
+  if ((error_ != CL_MEM_OBJECT_ALLOCATION_FAILURE) &&
+      (error_ != CL_OUT_OF_RESOURCES) && (error_ != CL_OUT_OF_HOST_MEMORY)) {
+    _wrapper->clFinish(cmd_queue_);  // wait for the fill (any other status is left in error_)
+  } else {  // out of memory is not treated as a failure: the subtest is skipped
+    error_ = CL_SUCCESS;
+    skip_ = true;
+    testDescString = "Not enough memory, skipped";
+  }
+#endif
+}
+
+void OCLPerfSVMSampleRate::checkData(void *buffer) {  // expects every float to equal numBufs_
+#if defined(CL_VERSION_2_0)
+  error_ = _wrapper->clEnqueueSVMMap(cmd_queue_, true, CL_MAP_READ, buffer,
+                                     outBufSize_, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMMap failed");
+  const float *values = (const float *)buffer;
+  const float expected = (float)numBufs_;
+  for (unsigned int idx = 0; idx < outBufSize_ / sizeof(float); ++idx) {
+    if (values[idx] == expected) continue;  // only the first mismatch is reported
+    printf("Data validation failed at %d! Got %f, expected %f\n", idx,
+           values[idx], expected);
+    break;
+  }
+  error_ = _wrapper->clEnqueueSVMUnmap(cmd_queue_, buffer, 0, NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // intentionally empty; nothing in this file references it
+
+// Prepares one subtest: decodes the index, allocates the input/output buffers,
+// builds the generated kernel and binds its arguments (or sets skip_).
+void OCLPerfSVMSampleRate::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  cl_device_id device;
+  error_ = CL_SUCCESS;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  cmd_queue_ = 0;
+  inBuffer_ = NULL;
+  outBuffer_ = NULL;
+  coarseGrainBuffer_ = false;
+  fineGrainBuffer_ = false;
+  fineGrainSystem_ = false;
+
+  // We compute a square domain
+  width_ = sizes[test % NUM_SIZES];
+  typeIdx_ = (test / NUM_SIZES) % NUM_TYPES;
+  bufSize_ = width_ * width_ * typeSizes[typeIdx_];
+  numBufs_ = (1 << ((test / (NUM_SIZES * NUM_TYPES)) % NUM_BUFS));
+  svmMode_ = test / (NUM_SIZES * NUM_TYPES * NUM_BUFS);
+
+  device = devices_[deviceId];
+
+#if defined(CL_VERSION_2_0)
+  cl_device_svm_capabilities caps;
+  error_ = clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES,
+                           sizeof(cl_device_svm_capabilities), &caps, NULL);
+  // caps is only meaningful if the query succeeded; never read it otherwise.
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (svmMode_ == 0) {
+    if (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
+      coarseGrainBuffer_ = true;
+      testdesc = "crs";
+    } else {
+      skip_ = true;  // Should never happen as OCL 2.0 devices are required to
+                     // support coarse grain SVM
+      testDescString = "Coarse grain SVM NOT supported. Test Skipped.";
+      return;
+    }
+  } else if (svmMode_ == 1) {
+    if (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
+      fineGrainBuffer_ = true;
+      testdesc = "fgb";
+    } else {
+      skip_ = true;  // No support for fine grain buffer SVM
+      testDescString = "Fine grain buffer SVM NOT supported. Test Skipped.";
+      return;
+    }
+  } else if (svmMode_ == 2) {
+    if (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) {
+      fineGrainSystem_ = true;
+      testdesc = "fgs";
+    } else {
+      skip_ = true;  // No support for fine grain system SVM
+      testDescString = "Fine grain system SVM NOT supported. Test Skipped.";
+      return;
+    }
+  }
+
+  char charbuf[1024];
+
+  cmd_queue_ = cmdQueues_[_deviceId];
+
+  outBufSize_ =
+      sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
+  // The pointer table is needed in every mode; only how entries are allocated differs.
+  inBuffer_ = (void **)malloc(sizeof(void *) * numBufs_);
+  CHECK_RESULT(inBuffer_ == NULL, "malloc(inBuffer table) failed");
+  memset(inBuffer_, 0, sizeof(void *) * numBufs_);
+  if ((svmMode_ == 0) || (svmMode_ == 1)) {
+    cl_mem_flags flags;
+    flags = CL_MEM_READ_ONLY;
+    if (svmMode_ == 1) flags |= CL_MEM_SVM_FINE_GRAIN_BUFFER;
+    for (unsigned int i = 0; i < numBufs_; i++) {
+      inBuffer_[i] = _wrapper->clSVMAlloc(context_, flags, bufSize_, 0);
+      CHECK_RESULT(inBuffer_[i] == NULL, "clSVMAlloc(inBuffer) failed");
+    }
+
+    flags = CL_MEM_WRITE_ONLY;
+    if (svmMode_ == 1) flags |= CL_MEM_SVM_FINE_GRAIN_BUFFER;
+    outBuffer_ = _wrapper->clSVMAlloc(context_, flags, outBufSize_, 0);
+    CHECK_RESULT(outBuffer_ == NULL, "clSVMAlloc(outBuffer) failed");
+  } else {
+    for (unsigned int i = 0; i < numBufs_; i++) {
+      inBuffer_[i] = malloc(bufSize_);
+      CHECK_RESULT(inBuffer_[i] == NULL, "malloc(inBuffer) failed");
+    }
+    outBuffer_ = malloc(outBufSize_);
+    CHECK_RESULT(outBuffer_ == NULL, "malloc(outBuffer) failed");
+  }
+
+  setKernel();
+  const char *source = shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&source, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  // Have to force OCL 2.0 to use SVM
+  SNPRINTF(charbuf, sizeof(charbuf), "-cl-std=CL2.0 -D DATATYPE=%s",
+           types[typeIdx_]);
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, charbuf, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = {0};  // stays an empty string if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+    // This check used to test a constant 0, so the build failure was never raised.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "sampleRate", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, 0, outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed");
+  unsigned int sizeDW = width_ * width_;
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int),
+                                    (void *)&sizeDW);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(sizeDW) failed");
+  unsigned int writeIt = 0;
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int),
+                                    (void *)&writeIt);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(writeIt) failed");
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, i + 3, inBuffer_[i]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed");
+    setData(inBuffer_[i], 0x3f800000);
+    if (skip_) return;
+  }
+  setData(outBuffer_, 0xdeadbeef);
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+// Launches the sampleRate kernel repeatedly, times the batch, and reports the
+// aggregate read bandwidth (GB/s) in _perfInfo with a description string.
+void OCLPerfSVMSampleRate::run(void) {
+  // Bail out first: when open() skipped early, outBufSize_ was never assigned.
+  if (skip_) return;
+
+  size_t global_work_size[1] = {(size_t)(outBufSize_ / typeSizes[typeIdx_])};
+  size_t local_work_size[1] = {64};
+  unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_);
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < maxIter; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) break;  // keep the first failure for the check below
+  }
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Test doesn't write anything, so nothing to check
+  // Compute GB/s
+  double perf =
+      ((double)outBufSize_ * NUM_READS * (double)maxIter * (double)(1e-09)) /
+      sec;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "Domain %dx%d, %2d %s bufs, %6s, %4dx%4d (GB/s)",
+           sizes[NUM_SIZES - 1], sizes[NUM_SIZES - 1], numBufs_,
+           testdesc.c_str(), types[typeIdx_], width_, width_);
+
+  _perfInfo = (float)perf;
+  testDescString = buf;
+}
+
+// Releases what open() allocated for this subtest (input/output buffers,
+// kernel, program) and returns the result of OCLTestImp::close().
+unsigned int OCLPerfSVMSampleRate::close(void) {
+#if defined(CL_VERSION_2_0)
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+
+  // open() uses clSVMAlloc for modes 0 and 1 and plain malloc otherwise.
+  const bool svmAllocated = (svmMode_ == 0) || (svmMode_ == 1);
+  if (inBuffer_) {
+    for (unsigned int i = 0; i < numBufs_; i++) {
+      if (!inBuffer_[i]) continue;
+      if (svmAllocated) {
+        // clSVMFree returns void; the old error_ check here only saw a stale status.
+        _wrapper->clSVMFree(context_, inBuffer_[i]);
+      } else {
+        free(inBuffer_[i]);
+      }
+    }
+    free(inBuffer_);
+    inBuffer_ = NULL;  // a repeated close() must not free the table again
+  }
+  if (outBuffer_) {
+    if (svmAllocated) {
+      _wrapper->clSVMFree(context_, outBuffer_);
+    } else {
+      free(outBuffer_);
+    }
+    outBuffer_ = NULL;
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel_ = 0;
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+    program_ = 0;
+  }
+#endif
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h
new file mode 100644
index 0000000000..c388766cdd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_SVMSAMPLERATE_H_
+#define _OCL_SVMSAMPLERATE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSVMSampleRate : public OCLTestImp {  // times a generated kernel that reads SVM buffers
+ public:
+  OCLPerfSVMSampleRate();
+  virtual ~OCLPerfSVMSampleRate();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // decodes `test`, allocates buffers, builds the kernel
+  virtual void run(void);  // timed kernel launches; result goes to _perfInfo
+  virtual unsigned int close(void);  // frees buffers, releases kernel and program
+
+  std::string shader_;  // kernel source text produced by setKernel()
+  void setData(void* buffer, unsigned int data);  // fills bufSize_ bytes of buffer with a 32-bit pattern
+  void checkData(void* buffer);  // compares buffer contents against numBufs_; only referenced from a comment in run()
+  void setKernel(void);  // regenerates shader_ for the current numBufs_ / typeIdx_
+
+  cl_command_queue cmd_queue_;  // copy of cmdQueues_[_deviceId], set in open()
+  cl_program program_;
+  cl_kernel kernel_;
+  void** inBuffer_;  // malloc'd table of numBufs_ input pointers
+  void* outBuffer_;  // outBufSize_-byte output allocation
+
+  unsigned int width_;  // edge of the square domain, taken from sizes[]
+  unsigned int bufSize_;  // bytes per input buffer: width_ * width_ * element size
+  unsigned int outBufSize_;  // bytes: largest sizes[] entry squared times largest typeSizes[] entry
+  static const unsigned int MAX_ITERATIONS = 25;  // run() launches MAX_ITERATIONS * (MAX_BUFS / numBufs_) kernels
+  unsigned int numBufs_;  // number of input buffers, a power of two (1 << step)
+  unsigned int typeIdx_;  // index into types / typeSizes
+  unsigned int svmMode_;  // 0 coarse grain buffer, 1 fine grain buffer, 2 fine grain system
+
+  bool skip_;  // subtest is not applicable; run() returns immediately
+  bool coarseGrainBuffer_;  // set in open() for svmMode_ 0
+  bool fineGrainBuffer_;  // set in open() for svmMode_ 1
+  bool fineGrainSystem_;  // set in open() for svmMode_ 2
+  std::string testdesc;  // "crs" / "fgb" / "fgs" tag used in the result description
+};
+
+#endif  // _OCL_SVMSAMPLERATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp
new file mode 100644
index 0000000000..11ff83b692
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp
@@ -0,0 +1,336 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSampleRate.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_TYPES 3
+static const char *types[NUM_TYPES] = {"float", "float2", "float4"};
+static const unsigned int typeSizes[NUM_TYPES] = {4, 8, 16};
+
+#define NUM_SIZES 12
+static const unsigned int sizes[NUM_SIZES] = {1,  2,   4,   8,   16,   32,
+                                              64, 128, 256, 512, 1024, 2048};
+
+#define NUM_BUFS 6
+#define MAX_BUFS (1 << (NUM_BUFS - 1))
+
+OCLPerfSampleRate::OCLPerfSampleRate() {
+  _numSubTests = NUM_TYPES * NUM_SIZES * NUM_BUFS;  // every combination
+  skip_ = false;  // set later by setData() if a buffer cannot be mapped
+}
+
+OCLPerfSampleRate::~OCLPerfSampleRate() {}
+
+void OCLPerfSampleRate::setKernel(void) {  // rebuilds shader_ for numBufs_
+  shader_.clear();
+  shader_ +=
+      "kernel void sampleRate(global DATATYPE* outBuffer, unsigned int "
+      "inBufSize, unsigned int writeIt,\n";
+  char buf[256];
+  for (unsigned int i = 0; i < numBufs_; i++) {  // one pointer arg per input
+    SNPRINTF(buf, sizeof(buf), "global DATATYPE* inBuffer%d", i);
+    shader_ += buf;
+    if (i < (numBufs_ - 1)) {  // no comma after the last parameter
+      shader_ += ",";
+    }
+    shader_ += "\n";
+  }
+  shader_ += ")\n";
+  shader_ +=
+      "{\n"
+      "    uint gid = get_global_id(0);\n"
+      "    uint inputIdx = gid % inBufSize;\n"
+      "    DATATYPE tmp = (DATATYPE)0.0f;\n";
+  // Accumulate one element from every input buffer.
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    SNPRINTF(buf, sizeof(buf), "    tmp += inBuffer%d[inputIdx];\n", i);
+    shader_ += buf;
+  }
+  if (typeSizes[typeIdx_] > 4) {  // float2/float4: test the .x component
+    shader_ +=
+        "    if (writeIt*(unsigned int)tmp.x) outBuffer[gid] = tmp;\n"
+        "}\n";
+  } else {  // scalar float
+    shader_ +=
+        "    if (writeIt*(unsigned int)tmp) outBuffer[gid] = tmp;\n"
+        "}\n";
+  }
+  // printf("Shader -> %s\n", shader_.c_str());
+}
+
+void OCLPerfSampleRate::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);  // NOTE: always maps bufSize_ bytes, also for outBuffer_
+  if (data == NULL) {
+    if ((error_ == CL_MEM_OBJECT_ALLOCATION_FAILURE) ||
+        (error_ == CL_OUT_OF_RESOURCES) || (error_ == CL_OUT_OF_HOST_MEMORY)) {
+      printf("WARNING: Not enough memory, skipped\n");
+      error_ = CL_SUCCESS;  // running out of memory is a skip, not a failure
+      skip_ = true;
+    } else {
+      CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueMapBuffer failed");
+    }
+    return;
+  }
+  for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++)
+    data[i] = val;  // fill with the raw 32-bit pattern
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+void OCLPerfSampleRate::checkData(cl_mem buffer) {  // validates a float buffer
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, outBufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");  // no NULL deref
+  for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) {
+    if (data[i] == (float)numBufs_) continue;
+    printf("Data validation failed at %u! Got %f, expected %f\n", i, data[i],
+           (float)numBufs_);
+    break;  // only the first mismatch is reported
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerfSampleRate::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  // Decode the sub-test index: size varies fastest, then data type, then the
+  // buffer count (a power of two). Each input holds width_ x width_ elements.
+  width_ = sizes[test % NUM_SIZES];
+  typeIdx_ = (test / NUM_SIZES) % NUM_TYPES;
+  bufSize_ = width_ * width_ * typeSizes[typeIdx_];
+  numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES)));
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    delete[] platforms;  // allocated with new[], so release with delete[]
+  }
+  /*
+   * If we could find a platform, use it.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device; the list is freed before any check can bail */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices)
+    device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = (cl_mem *)calloc(numBufs_, sizeof(cl_mem));
+  CHECK_RESULT(inBuffer_ == 0, "calloc(inBuffer) failed");
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    inBuffer_[i] = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY,
+                                            bufSize_, NULL, &error_);
+    CHECK_RESULT(inBuffer_[i] == 0, "clCreateBuffer(inBuffer) failed");
+  }
+
+  outBufSize_ =
+      sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
+  outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                        outBufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  setKernel();
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  const char *buildOps = NULL;
+  SNPRINTF(charbuf, sizeof(charbuf), "-D DATATYPE=%s", types[typeIdx_]);
+  buildOps = charbuf;
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    // Best effort: print the build log; it stays empty if the query fails.
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // CHECK_RESULT acts on a true condition (see every other call site).
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "sampleRate", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed");
+  unsigned int sizeDW = width_ * width_;
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int),
+                                    (void *)&sizeDW);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(sizeDW) failed");
+  unsigned int writeIt = 0;
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int),
+                                    (void *)&writeIt);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(writeIt) failed");
+  for (unsigned int i = 0; i < numBufs_; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, i + 3, sizeof(cl_mem),
+                                      (void *)&inBuffer_[i]);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed");
+    setData(inBuffer_[i], 0x3f800000);
+    if (skip_) return;
+  }
+  setData(outBuffer_, 0xdeadbeef);
+}
+
+void OCLPerfSampleRate::run(void) {  // times maxIter launches, reports GB/s
+  int global = outBufSize_ / typeSizes[typeIdx_];
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+  unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_);
+
+  if (skip_) return;
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < maxIter; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");  // every launch
+  }
+
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // checkData(outBuffer_);
+  // Compute GB/s
+  double perf =
+      ((double)outBufSize_ * numBufs_ * (double)maxIter * (double)(1e-09)) /
+      sec;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), "Domain %ux%u, %2u bufs, %6s, %4ux%4u (GB/s)",
+           sizes[NUM_SIZES - 1], sizes[NUM_SIZES - 1], numBufs_,
+           types[typeIdx_], width_, width_);
+
+  _perfInfo = (float)perf;
+  testDescString = buf;
+}
+
+unsigned int OCLPerfSampleRate::close(void) {
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);  // open() may have failed
+
+  if (inBuffer_) {
+    for (unsigned int i = 0; i < numBufs_; i++) {
+      if (inBuffer_[i]) {
+        error_ = _wrapper->clReleaseMemObject(inBuffer_[i]);
+        CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                               "clReleaseMemObject(inBuffer_) failed");
+      }
+    }
+    free(inBuffer_);
+    inBuffer_ = 0;  // a repeated close() must not free the array again
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h
new file mode 100644
index 0000000000..93bf6dc6a1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_SAMPLERATE_H_
+#define _OCL_SAMPLERATE_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSampleRate : public OCLTestImp {
+ public:
+  OCLPerfSampleRate();
+  virtual ~OCLPerfSampleRate();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;  // kernel source assembled by setKernel()
+  void setData(cl_mem buffer, unsigned int data);
+  void checkData(cl_mem buffer);
+  void setKernel(void);
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem* inBuffer_;  // heap array of numBufs_ read-only input buffers
+  cl_mem outBuffer_;  // write-only buffer of outBufSize_ bytes
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int width_;       // edge of the square input domain, in elements
+  unsigned int bufSize_;     // bytes per input buffer
+  unsigned int outBufSize_;  // bytes in the output buffer
+  static const unsigned int MAX_ITERATIONS = 25;  // scaled up for fewer inputs
+  unsigned int numBufs_;  // input buffers read per work-item (1, 2, ... 32)
+  unsigned int typeIdx_;  // index into types[] / typeSizes[] in the .cpp
+
+  bool skip_;  // set by setData() when a map fails for lack of memory
+};
+
+#endif  // _OCL_SAMPLERATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp
new file mode 100644
index 0000000000..922ae44025
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp
@@ -0,0 +1,325 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfScalarReplArrayElem.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {16777216};  // 16 MiB, in bytes
+
+static void genKernelSource(const char *vtypeName, unsigned arrayLen,
+                            unsigned loopCount, char *source) {
+  sprintf(source,  // NOTE(review): unbounded; caller must size `source`
+          "%s foo(uint lid, __local %s *localLocal)\n"
+          "{\n"
+          "    %s val0 = 0.0f;\n"
+          "    %s val1 = 0.0f;\n"
+          "    for (int i = 0; i < %d; ++i) {\n"
+          "      val0 += localLocal[lid];\n"
+          "      lid += 16;\n"
+          "    }\n"
+          "    %s val = val0+val1;\n"
+          "    return val;\n"
+          "}\n"
+          "__kernel __attribute__((reqd_work_group_size(64,1,1)))"
+          "  void _ldsReadSpeed(__global %s *outBuf)\n"
+          "{\n"
+          "    uint gid = (int) get_global_id(0);\n"
+          "    uint lid = (int) get_local_id(0);\n"
+          "    __local %s localLocal[%d];\n"
+          "    outBuf[gid] = foo(lid, localLocal);\n"
+          "}\n",
+          vtypeName, vtypeName, vtypeName, vtypeName, loopCount, vtypeName,
+          vtypeName, vtypeName, arrayLen);  // arguments in placeholder order
+}
+
+typedef struct {
+  const char *name;  // OpenCL C scalar type name
+  unsigned nBytes;   // size of one scalar in bytes
+} ExplicitType;
+
+static const ExplicitType tyChar = {"char", 1};
+static const ExplicitType tyShort = {"short", 2};
+static const ExplicitType tyInt = {"int", 4};
+static const ExplicitType tyLong = {"long", 8};
+static const ExplicitType tyFloat = {"float", 4};
+static const ExplicitType tyDouble = {"double", 8};  // not used by vecTypes
+
+typedef struct {
+  ExplicitType elemType;
+  unsigned nElems;   // number of scalars in the vector
+  const char *name;  // type name pasted into the generated kernel
+  unsigned getSize() const { return elemType.nBytes * nElems; }  // in bytes
+} VectorType;
+
+static const VectorType vecTypes[] = {
+    {tyChar, 8, "char8"},     {tyShort, 4, "short4"},   {tyInt, 2, "int2"},
+    {tyFloat, 2, "float2"},   {tyLong, 1, "long"},  // 8-byte types
+
+    {tyChar, 16, "char16"},   {tyShort, 8, "short8"},   {tyInt, 4, "int4"},
+    {tyFloat, 4, "float4"},   {tyLong, 2, "long2"},  // 16-byte types
+
+    {tyShort, 16, "short16"}, {tyInt, 8, "int8"},       {tyFloat, 8, "float8"},
+    {tyLong, 4, "long4"},  // 32-byte types
+
+    {tyInt, 16, "int16"},     {tyFloat, 16, "float16"}, {tyLong, 8, "long8"},
+    // (64-byte types above, 128-byte type below)
+    {tyLong, 16, "long16"}};
+static const unsigned ldsBytes = 4 * 4096;  // bytes of __local array per kernel
+static const unsigned nVecTypes = sizeof(vecTypes) / sizeof(VectorType);
+
+void OCLPerfScalarReplArrayElem::genShader(unsigned int idx) {
+  // Sizes the kernel's __local array to ldsBytes for vecTypes[idx] and records
+  // the per-work-item read count and element width that run() reports with.
+  const VectorType &vecType = vecTypes[idx];
+  unsigned arrayLen = ldsBytes / vecType.getSize();
+  unsigned loopCount = arrayLen / 16;
+  char source[7192];
+  genKernelSource(vecType.name, arrayLen, loopCount, source);
+  shader_ = std::string(source);
+  numReads_ = loopCount;
+  itemWidth_ = vecType.getSize();
+}
+
+OCLPerfScalarReplArrayElem::OCLPerfScalarReplArrayElem() {
+  _numSubTests = NUM_SIZES * nVecTypes;  // one per (size, vector type) pair
+}
+
+OCLPerfScalarReplArrayElem::~OCLPerfScalarReplArrayElem() {}
+
+void OCLPerfScalarReplArrayElem::setData(cl_mem buffer, float val) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_WRITE, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  // NOTE(review): data is used without a NULL check; bufSize_ >> 2 is floats.
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);  // status is not checked
+}
+
+void OCLPerfScalarReplArrayElem::checkData(cl_mem buffer) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_READ, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) {  // NOTE: no NULL check
+    if (data[i] != (float)numReads_) {
+      printf("Data validation failed at index %d!\n", i);
+      printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, numReads_,
+             numReads_, numReads_, (unsigned int)data[i],
+             (unsigned int)data[i + 1], (unsigned int)data[i + 2],
+             (unsigned int)data[i + 3]);  // NOTE(review): may read past the end
+      CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n");
+      break;  // NOTE(review): the check above passes 0 -- does it ever fire?
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerfScalarReplArrayElem::open(unsigned int test, char *units,
+                                      double &conversion,
+                                      unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+  _openTest = test;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+#if 0
+        // Get last for default
+        platform = platforms[numPlatforms-1];
+        for (unsigned i = 0; i < numPlatforms; ++i) {
+#endif
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose platform with GPU devices
+    // if (num_devices > 0)
+    //{
+    //    platform = platforms[_platformIndex];
+    //    break;
+    //}
+#if 0
+        }
+#endif
+    delete[] platforms;  // allocated with new[], so release with delete[]
+  }
+
+  width_ = Sizes[test % NUM_SIZES];
+  shaderIdx_ = test / NUM_SIZES;
+  bufSize_ = width_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device; the list is freed before any check can bail */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices)
+    device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader(shaderIdx_);
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    // Best effort: print the build log; it stays empty if the query fails.
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // CHECK_RESULT acts on a true condition (see every other call site).
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_ldsReadSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed");
+  // setData(outBuffer_, 1.2345678f);
+}
+
+void OCLPerfScalarReplArrayElem::run(void) {
+  int global = bufSize_ / itemWidth_;  // one work-item per output element
+  int local = 64;  // matches reqd_work_group_size(64,1,1) in the kernel
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);  // the timed region ends once all have run
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Bytes read from the __local array per second, in GB/s
+  double perf =
+      ((double)global * numReads_ * itemWidth_ * NUM_ITER * (double)(1e-09)) /
+      sec;
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %10s %8d threads, %4d reads (GB/s)",
+           vecTypes[shaderIdx_].name, global, numReads_);
+  testDescString = buf;
+  // checkData(outBuffer_);
+}
+
+unsigned int OCLPerfScalarReplArrayElem::close(void) {
+  if (outBuffer_) {  // each handle is released only if open() created it
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;  // reset to 0 in open() and not modified in this file
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h
new file mode 100644
index 0000000000..f931c2fc18
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ScalarReplArrayElem_H_
+#define _OCL_ScalarReplArrayElem_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfScalarReplArrayElem : public OCLTestImp {
+ public:
+  OCLPerfScalarReplArrayElem();
+  virtual ~OCLPerfScalarReplArrayElem();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  std::string shader_;  // kernel source produced by genShader()
+  void genShader(unsigned int idx);
+  void setData(cl_mem buffer, float data);
+  void checkData(cl_mem buffer);
+
+  static const unsigned int NUM_ITER = 100;  // kernel launches timed by run()
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;  // kernel output, bufSize_ bytes
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int width_;       // entry of Sizes[] chosen for the sub-test
+  unsigned int bufSize_;     // output buffer size in bytes (set from width_)
+  unsigned int numReads_;    // loop iterations per work-item in the kernel
+  unsigned int shaderIdx_;   // index into vecTypes[]
+  unsigned int itemWidth_;   // bytes per output element
+  unsigned int vecTypeIdx_;  // not referenced by the .cpp
+  unsigned int vecSizeIdx_;  // not referenced by the .cpp
+};
+
+#endif  // _OCL_ScalarReplArrayElem_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp
new file mode 100644
index 0000000000..5cad2e0a51
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp
@@ -0,0 +1,261 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSdiP2PCopy.h"
+
+#include <string.h>
+
+#include "Timer.h"
+
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 5
+// Copy sizes in bytes, indexed by sub-test: 64KB, 256KB, 1 MB, 4MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {65536, 262144, 1048576, 4194304,
+                                              16777216};
+
+OCLPerfSdiP2PCopy::OCLPerfSdiP2PCopy() {
+  // NUM_SIZES sub-tests per copy direction: the two GPUs may differ, so
+  // open() swaps them for sub-tests >= NUM_SIZES and both get measured.
+  _numSubTests = 2 * NUM_SIZES;
+}
+
+OCLPerfSdiP2PCopy::~OCLPerfSdiP2PCopy() {}  // resources are released in close()
+
+void OCLPerfSdiP2PCopy::open(unsigned int test, char* units, double& conversion,
+                             unsigned int deviceId) {
+  // Prepares sub-test `test`: the buffer size is Sizes[test % NUM_SIZES] and
+  // sub-tests >= NUM_SIZES swap the two GPUs so the copy runs the other way.
+  // Creates one context and queue per GPU, a bus-addressable buffer on
+  // devices_[0], an external-physical buffer built from its bus address on
+  // devices_[1], and a source buffer on devices_[1] holding 1, 2, 3, ...
+  // `units` and `deviceId` are not used; `conversion` is set to 1. If two GPUs
+  // with cl_amd_bus_addressable_memory are not present, silentFailure is set
+  // and run() does nothing.
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _openTest = test % NUM_SIZES;
+  bufSize_ = Sizes[_openTest];
+  error_ = 0;
+  srcBuff_ = 0;
+  inputArr_ = 0;
+  outputArr_ = 0;
+  extPhysicalBuff_ = 0;
+  silentFailure = false;
+  busAddressableBuff_ = 0;
+  devices_[0] = devices_[1] = 0;
+  contexts_[0] = contexts_[1] = 0;
+  cmd_queues_[0] = cmd_queues_[1] = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(numPlatforms == 0, "clGetPlatformIDs failed");
+  error_ = _wrapper->clGetPlatformIDs(1, &platform, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  error_ = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
+                                    &num_devices);
+  if (num_devices != 2) {
+    printf(
+        "\nSilent Failure: Two GPUs are required to run OCLPerfSdiP2PCopy "
+        "test\n");
+    silentFailure = true;
+    return;
+  }
+  error_ = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices,
+                                    devices_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  if (test >= NUM_SIZES) {
+    cl_device_id temp = devices_[0];
+    devices_[0] = devices_[1];
+    devices_[1] = temp;
+  }
+
+  // Both GPUs must advertise cl_amd_bus_addressable_memory. The query result
+  // lives in a std::string so that no early return below (CHECK_RESULT is
+  // assumed to return on failure) can leak it, unlike a malloc'ed buffer.
+  size_t param_size = 0;
+  for (unsigned int gpu = 0; gpu < 2; ++gpu) {
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_EXTENSIONS, 0,
+                                       0, &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    // One spare byte keeps the text NUL-terminated whatever the driver writes.
+    std::string extensions(param_size + 1, '\0');
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_EXTENSIONS,
+                                       param_size, &extensions[0], 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    if (strstr(extensions.c_str(), "cl_amd_bus_addressable_memory") == 0) {
+      printf(
+          "\nSilent Failure: cl_amd_bus_addressable_memory extension is not "
+          "enabled on GPU %u\n",
+          gpu);
+      silentFailure = true;
+      return;
+    }
+  }
+
+  // Label for the result line: " [<devices_[1] name>-><devices_[0] name>]",
+  // i.e. source GPU first, matching the copy direction used by run().
+  deviceNames_ = " [";
+  for (int gpu = 1; gpu >= 0; --gpu) {
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_NAME, 0, 0,
+                                       &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    std::string name(param_size + 1, '\0');
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_NAME,
+                                       param_size, &name[0], 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    if (gpu == 0) {
+      deviceNames_ = deviceNames_ + "->";
+    }
+    deviceNames_ = deviceNames_ + name.c_str();
+  }
+  deviceNames_ = deviceNames_ + "]";
+  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)platform, 0};
+
+  // One context and one command queue per GPU.
+  contexts_[0] =
+      _wrapper->clCreateContext(props, 1, &devices_[0], 0, 0, &error_);
+  CHECK_RESULT(contexts_[0] == 0, "clCreateContext failed");
+  contexts_[1] =
+      _wrapper->clCreateContext(props, 1, &devices_[1], 0, 0, &error_);
+  CHECK_RESULT(contexts_[1] == 0, "clCreateContext failed");
+  cmd_queues_[0] =
+      _wrapper->clCreateCommandQueue(contexts_[0], devices_[0], 0, NULL);
+  CHECK_RESULT(cmd_queues_[0] == 0, "clCreateCommandQueue failed");
+  cmd_queues_[1] =
+      _wrapper->clCreateCommandQueue(contexts_[1], devices_[1], 0, NULL);
+  CHECK_RESULT(cmd_queues_[1] == 0, "clCreateCommandQueue failed");
+  // Create the bus-addressable buffer in contexts_[0], make it resident to
+  // obtain its bus address (busAddr_), then create a
+  // CL_MEM_EXTERNAL_PHYSICAL_AMD buffer from that address in contexts_[1].
+  busAddressableBuff_ = _wrapper->clCreateBuffer(
+      contexts_[0], CL_MEM_BUS_ADDRESSABLE_AMD, bufSize_, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  error_ = _wrapper->clEnqueueMakeBuffersResidentAMD(
+      cmd_queues_[0], 1, &busAddressableBuff_, true, &busAddr_, 0, 0, 0);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clEnqueueMakeBuffersResidentAMD failed");
+  extPhysicalBuff_ = _wrapper->clCreateBuffer(
+      contexts_[1], CL_MEM_EXTERNAL_PHYSICAL_AMD, bufSize_, &busAddr_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  srcBuff_ = _wrapper->clCreateBuffer(contexts_[1], CL_MEM_READ_WRITE, bufSize_,
+                                      0, &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clCreateBuffer failed");
+  // Host reference data (1, 2, 3, ...) and a zeroed read-back area; run()
+  // compares the two after the copy. close() frees whichever was allocated.
+  inputArr_ = (cl_uint*)malloc(bufSize_);
+  outputArr_ = (cl_uint*)malloc(bufSize_);
+  CHECK_RESULT(inputArr_ == 0 || outputArr_ == 0, "malloc failed");
+  for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_uint)); ++i) {
+    inputArr_[i] = i + 1;
+    outputArr_[i] = 0;
+  }
+  // Blocking upload (CL_TRUE) of the reference data into the source buffer.
+  error_ = _wrapper->clEnqueueWriteBuffer(cmd_queues_[1], srcBuff_, CL_TRUE, 0,
+                                          bufSize_, inputArr_, 0, 0, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed");
+}
+
+void OCLPerfSdiP2PCopy::run(void) {
+  // Times NUM_ITER back-to-back copies of srcBuff_ into extPhysicalBuff_ on
+  // queue 1, reads the result back through busAddressableBuff_ on queue 0,
+  // verifies it against inputArr_ and reports the bandwidth in GB/s.
+  if (silentFailure) return;  // prerequisites were missing in open()
+  CPerfCounter timer;
+  // Warm up
+  error_ =
+      _wrapper->clEnqueueCopyBuffer(cmd_queues_[1], srcBuff_, extPhysicalBuff_,
+                                    0, 0, bufSize_, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queues_[1], srcBuff_,
+                                           extPhysicalBuff_, 0, 0, bufSize_, 0,
+                                           NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  // Read back on the other GPU's queue to check that the data arrived there.
+  error_ = _wrapper->clEnqueueReadBuffer(cmd_queues_[0], busAddressableBuff_,
+                                         CL_TRUE, 0, bufSize_, outputArr_, 0, 0,
+                                         NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((memcmp(inputArr_, outputArr_, bufSize_) != 0), "copy failed");
+  // Buffer copy bandwidth in GB/s
+  double perf = ((double)bufSize_ * NUM_ITER * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i:%4u (GB/s) %s", bufSize_, NUM_ITER,
+           deviceNames_.c_str());
+  testDescString = buf;
+}
+
+unsigned int OCLPerfSdiP2PCopy::close(void) {
+  // Tears down what open() created: the three buffers, then both command
+  // queues, then both contexts, then the two host arrays. Release failures go
+  // through CHECK_RESULT_NO_RETURN (which, going by its name, does not return
+  // early), so the remaining objects are still released. Returns _crcword.
+  // 1) memory objects
+  const cl_mem buffers[3] = {srcBuff_, extPhysicalBuff_, busAddressableBuff_};
+  for (unsigned int b = 0; b < 3; ++b) {
+    if (buffers[b] == 0) {
+      continue;
+    }
+    error_ = _wrapper->clReleaseMemObject(buffers[b]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed");
+  }
+
+  // 2) command queues
+  for (unsigned int q = 0; q < 2; ++q) {
+    if (cmd_queues_[q] == 0) {
+      continue;
+    }
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queues_[q]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  // 3) contexts
+  for (unsigned int c = 0; c < 2; ++c) {
+    if (contexts_[c] == 0) {
+      continue;
+    }
+    error_ = _wrapper->clReleaseContext(contexts_[c]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  // free(NULL) is a no-op, so the arrays need no guard.
+  free(inputArr_);
+  free(outputArr_);
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h
new file mode 100644
index 0000000000..be0ef5e7b0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_SdiP2PCopy_H_
+#define _OCL_SdiP2PCopy_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfSdiP2PCopy : public OCLTestImp {  // two-GPU buffer copy perf test
+ public:
+  OCLPerfSdiP2PCopy();
+  virtual ~OCLPerfSdiP2PCopy();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  static const unsigned int NUM_ITER = 1024;  // timed copies per run()
+  bool silentFailure;               // set by open() when the test cannot run
+  cl_context contexts_[2];          // one context per GPU
+  cl_device_id devices_[2];         // [1] is the copy source, [0] the target
+  cl_command_queue cmd_queues_[2];  // one queue per context
+  cl_mem srcBuff_;                  // source buffer, created in contexts_[1]
+  cl_mem extPhysicalBuff_;          // contexts_[1] buffer built from busAddr_
+  cl_mem busAddressableBuff_;       // target buffer, created in contexts_[0]
+  cl_int error_;                    // status of the most recent OpenCL call
+  cl_bus_address_amd busAddr_;  // filled by clEnqueueMakeBuffersResidentAMD
+  cl_uint* inputArr_;           // host data uploaded to srcBuff_
+  cl_uint* outputArr_;          // host read-back of busAddressableBuff_
+  unsigned int bufSize_;        // bytes per copy
+  std::string deviceNames_;     // " [src->dst]" label for the result line
+};
+
+#endif  // _OCL_SdiP2PCopy_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp
new file mode 100644
index 0000000000..746cfbecb5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp
@@ -0,0 +1,586 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSepia.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define WIDTH 1024
+#define HEIGHT 1024
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))  // note: arguments are evaluated twice
+
+const char *sepiaVertexProgram =  // ARB vertex program loaded by runGL()
+    "!!ARBvp1.0\n"
+    "\n"
+    "\n"
+    "OPTION ARB_position_invariant;\n"
+    "\n"
+    "PARAM p0 = program.local[2];\n"
+    "PARAM p1 = program.local[3];\n"
+    "ATTRIB a0 = vertex.texcoord[0];\n"
+    "OUTPUT o0 = result.texcoord[0];\n"
+    "OUTPUT o1 = result.texcoord[1];\n"
+    "TEMP r0, r1;\n"
+    "\n"
+    "MOV o0, a0;\n"
+    "#SWZ r1, a0, x, y, 0, 0;\n"  // '#' lines look like disabled instructions
+    "#DPH r0.x, r1, p0;\n"
+    "#DPH r0.y, r1, p1;\n"
+    "#MOV o1, r0;\n"
+    "MOV o1, a0;\n"
+    "\n"
+    "END\n";
+
+const char *sepiaFragmentProgram =  // ARB fragment program loaded by runGL()
+    "!!ARBfp1.0\n"
+    "\n"
+    "\n"
+    "PARAM p0 = {1e-4, 0.085, 0.0, 0.0};\n"
+    "PARAM p1 = {0.2125, 0.7154, 0.0721, 0.0};\n"
+    "PARAM p2 = {-3605.984, 0.1323156, 0.0, -0.1991615};\n"
+    "PARAM p3 = {708.7939, -0.3903106, -0.05854013, 0.6621023};\n"
+    "PARAM p4 = {-50.93341, 0.4654831, 1.027555, -0.9069088};\n"
+    "PARAM p5 = {3.116672, 0.7926372, 0.03219686, 1.411847};\n"
+    "PARAM p6 = {8.95663e-4, -0.001104567, -6.0827e-4, 0.03277428};\n"
+    "PARAM p7 = program.local[0];\n"
+    "PARAM p8 = program.local[1];\n"
+    "ATTRIB a0 = fragment.texcoord[1];\n"
+    "OUTPUT o0 = result.color;\n"
+    "TEMP r0, r1, r2, r3;\n"
+    "\n"
+    "TEX r1, a0, texture[0], RECT;\n"
+    "#MAX r0, p0.x, r1.w;\n"  // '#' lines look like disabled instructions
+    "#RCP r2, r0.x;\n"
+    "#DP3 r3, r1, p1;\n"
+    "#MUL r0, r3, r2;\n"
+    "#MAD r2, r0, p2, p3;\n"
+    "#MAD r2, r2, r0, p4;\n"
+    "#MAD r0, r2, r0, p5;\n"
+    "#MUL r2, r1.w, p6;\n"
+    "#MAD r2, r0, r3, r2;\n"
+    "#MAD r0, r1.w, p0.y, -r3;\n"
+    "#CMP r2.x, -r0, r2.x, r2.w;\n"
+    "#MAD r0, r3, r3, -r3;\n"
+    "#CMP r0, r0.x, r2, r3;\n"
+    "#MOV r0.w, r1;\n"
+    "#MUL r0, r0, p7;\n"
+    "#LRP o0, p8.x, r0, r1;\n"
+    "MOV o0, r1;\n"  // the only live output: the sampled texel, unmodified
+    "\n"
+    "END\n";
+
+const static char *strKernel =  // OpenCL C source built by open(), sub-test 0
+    "\n"
+    "__kernel void program(write_only image2d_t dest, int flipped, int4 dim, "
+    "float2 st_origin, float4 st_delta, float4 l0, float4 l1, float4 l2, "
+    "float4 l3, read_only image2d_t t0, sampler_t t_sampler0)\n"
+    "{\n"
+    "      const sampler_t sam = CLK_NORMALIZED_COORDS_FALSE | "
+    "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
+    "//    const float4 p0  = (float4)( 0x1.b33334p-3, 0x1.6e48e8p-1, "
+    "0x1.275254p-4, 0x0p+0 );\n"
+    "//    const float4 p1  = (float4)( 0x1.a36e2ep-14, 0x1.5c28f6p-4, 0x0p+0, "
+    "0x0p+0 );\n"
+    "//    const float4 p2  = (float4)( 0x1.d595dap-11, -0x1.218e3cp-10, "
+    "-0x1.3ee89ep-11, 0x1.0c7ca6p-5 );\n"
+    "//    const float4 p3  = (float4)( -0x1.c2bf7cp+11, 0x1.0efb7cp-3, "
+    "0x0p+0, -0x1.97e1fcp-3 );\n"
+    "//    const float4 p4  = (float4)( 0x1.62659ep+9, -0x1.8fad94p-2, "
+    "-0x1.df8f8cp-5, 0x1.52ff12p-1 );\n"
+    "//   const float4 p5  = (float4)( -0x1.9777ap+5, 0x1.dca79ap-2, "
+    "0x1.070dd8p+0, -0x1.d0565ap-1 );\n"
+    "//    const float4 p6  = (float4)( 0x1.8eef1cp+1, 0x1.95d48cp-1, "
+    "0x1.07c1b6p-5, 0x1.696ecep+0 );\n"
+    "//    int          dest_width = dim.x;\n"
+    "//    int          dest_height = dim.y;\n"
+    "    float4       o0, r0, r1, r2, r3, r4;\n"
+    "//    float4       false_vector = (float4) 0.0f;\n"
+    "//    float4       true_vector = (float4) 1.0f;\n"
+    "    int2         loc = (int2)( get_global_id(0), get_global_id(1) );\n"
+    "//    if ((loc.x >= dim.x) || loc.y >= dim.y) return;\n"
+    "//    float4 f0 = (float4)( st_origin.x + ((float)loc.x + 0.5f) * "
+    "st_delta.x + ((float)loc.y + 0.5f) * st_delta.z, st_origin.y + "
+    "((float)loc.x + 0.5f) * st_delta.y + ((float)loc.y + 0.5f) * st_delta.w, "
+    "0.0f, 0.0f );\n"
+    "//    r2 = f0;\n"
+    "//    r0.x = dot(r2.xy,l2.xy) + l2.w;\n"
+    "//    r0.y = dot(r2.xy,l3.xy) + l3.w;\n"
+    "//    r4 = r0;\n"
+    "    r1 = read_imagef(t0, sam/*t_sampler0*/, r4.xy);\n"  // NOTE(review): r4 is read but never assigned ("r4 = r0" above is commented out)
+    "//    r3 = dot(r1.xyz,p0.xyz);\n"
+    "//    r2 = max(p1.xxxx, r1.wwww);\n"
+    "//    r0 = native_recip(r2.xxxx);\n"
+    "//    r4 = r3*r0;\n"
+    "//    r2 = r1.wwww*p2;\n"
+    "//    r0 = mad(r4,p3,p4);\n"
+    "//    r0 = mad(r0,r4,p5);\n"
+    "//    r0 = mad(r0,r4,p6);\n"
+    "//    r2 = mad(r0,r3,r2);\n"
+    "//    r0 = mad(r1.wwww,p1.yyyy,-r3);\n"
+    "//    r2.x = select(r2.w,r2.x, isless(-r0.x, 0.0f));\n"
+    "//    r0 = mad(r3,r3,-r3);\n"
+    "//    r0 = select(r3,r2, isless(r0.xxxx, 0.0f));\n"
+    "//    r0.w = r1.w;\n"
+    "//    r0 = r0*l0;\n"
+    "//    r0 = mix(r1,r0, l1.xxxx);\n"
+    "//    r0.xyz = min(r0.xyz, r0.www);\n"
+    "//    o0 = r0;\n"
+    "    write_imagef(dest, loc /*(int2)( loc.x + dim.z , flipped ? "
+    "get_image_height(dest) - (loc.y + dim.w + 1) : loc.y + dim.w )*/, r1 "
+    "/*o0*/);\n"
+    "}\n";
+
+OCLPerfSepia::OCLPerfSepia() { _numSubTests = 2; }  // 0: CL path, 1: GL path
+
+OCLPerfSepia::~OCLPerfSepia() {}  // buffers and the texture are freed in close()
+
+void OCLPerfSepia::open(unsigned int test, char *units, double &conversion,
+                        unsigned int deviceId) {
+  // Resets the per-test state, returns with silentFailure_ set when
+  // IsGLEnabled() reports false, runs OCLGLCommon::open() and, for sub-test 0
+  // only, builds strKernel and creates the "program" kernel.
+  bVerify_ = false;
+  silentFailure_ = false;
+  iterations_ = 50000;
+  bpr_ = 0;
+  data_ = 0;
+  result_ = 0;
+  width_ = 0;
+  height_ = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  texId = 0;
+  format_.image_channel_order = CL_RGBA;
+  format_.image_channel_data_type = CL_UNORM_INT8;
+
+  srand(0x8956);  // fixed seed (not time()) so every run sees the same data
+  if (!IsGLEnabled(test, units, conversion, deviceId)) {
+    silentFailure_ = true;
+    return;
+  }
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  if (test == 0) {
+    // Build the kernel
+    program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel,
+                                                   NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clCreateProgramWithSource()  failed (%d)", error_);
+    const char *optionsGPU = "-cl-denorms-are-zero -cl-mad-enable";
+    error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                      optionsGPU, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      char programLog[1024] = {0};  // stays "" if the log query fails
+      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                      CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                      programLog, 0);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
+                 error_);
+    kernel_ = _wrapper->clCreateKernel(program_, "program", &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
+                 error_);
+  }
+}
+
+void OCLPerfSepia::populateData(void) {
+  // 4 bytes per pixel: byte 0 of each pixel is 0xFF, the rest are rand() % 256.
+  width_ = WIDTH;
+  height_ = HEIGHT;
+  bpr_ = 4 * width_;
+  data_ = (cl_uchar *)malloc(height_ * bpr_);
+  for (unsigned int n = 0; data_ && n < (height_ * bpr_); n++)  // skip if OOM
+    data_[n] = (n & 3) ? (rand() % 256) : 0xFF;
+}
+
+void OCLPerfSepia::runGL(void) {  // OpenGL path of the test (sub-test != 0)
+  glDisable(GL_ALPHA_TEST);
+  glDisable(GL_DEPTH_TEST);
+  glDisable(GL_SCISSOR_TEST);
+  glDisable(GL_BLEND);
+  glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+  glDisable(GL_DITHER);
+  glDisable(GL_CULL_FACE);
+  glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+  glDepthMask(GL_FALSE);
+  glStencilMask(0);
+
+  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+
+  // let's create the textures we need
+
+  glEnable(GL_TEXTURE_RECTANGLE_EXT);
+  glGenTextures(1, &texId);
+  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);
+
+  // have GL alloc memory for us for our destination texture which we will be
+  // rendering into
+  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
+               GL_BGRA /*RGBA*/, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+  // for the source texture we will provide a data ptr and hang on to it
+  GLuint srcTexture;
+
+  glGenTextures(1, &srcTexture);
+  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, srcTexture);
+
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, width_);
+  glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, height_);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 8);
+
+  // XXX Alex -- use optimal texture upload format.
+  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
+               GL_BGRA, /* GL_RGBA,*/
+               format_.image_channel_order == CL_RGBA
+                   ? GL_UNSIGNED_INT_8_8_8_8
+                   : GL_UNSIGNED_INT_8_8_8_8_REV,
+               data_);
+
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S,
+                  GL_CLAMP_TO_EDGE);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T,
+                  GL_CLAMP_TO_EDGE);
+  glPixelStorei(GL_UNPACK_SWAP_BYTES, 0);
+  glPixelStorei(GL_UNPACK_LSB_FIRST, 0);
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
+  glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0);
+  glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0);
+  glPixelStorei(GL_UNPACK_SKIP_IMAGES, 0);
+  glPixelStorei(GL_UNPACK_SKIP_ROWS, 0);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
+
+  GLuint vertexProgram;
+  GLuint fragmentProgram;
+
+  glGenProgramsARB(1, &vertexProgram);
+  glGenProgramsARB(1, &fragmentProgram);
+
+  glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexProgram);
+  glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+                     (GLsizei)strlen(sepiaVertexProgram), sepiaVertexProgram);
+
+  glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fragmentProgram);
+  glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+                     (GLsizei)strlen(sepiaFragmentProgram),
+                     sepiaFragmentProgram);
+
+  GLfloat l0[] = {1.0f, 0.99f, 0.92f, 1.0f};
+  GLfloat l1[] = {0.5, 0, 0, 0};
+  GLfloat l2[] = {1, 0, 0, 0};
+  GLfloat l3[] = {0, -1, 0, (GLfloat)height_};
+
+  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 0, l0);
+  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 1, l1);
+  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 2, l2);
+  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 3, l3);
+
+  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, l0);
+  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 1, l1);
+  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 2, l2);
+  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 3, l3);
+
+  GLuint fbo;
+
+  glGenFramebuffersEXT(1, &fbo);
+
+  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo);
+  // Attach texId as the colour target of the FBO, so the quads render into it.
+  glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT,
+                            GL_TEXTURE_RECTANGLE_ARB, texId, 0);
+  glViewport(0, 0, width_, height_);
+  glMatrixMode(GL_PROJECTION);
+  glLoadIdentity();
+  glOrtho(0, width_, 0, height_, -1, 1);
+  glClearColor(0, 0, 0, 0);
+  glClear(GL_COLOR_BUFFER_BIT);
+  glDisable(GL_BLEND);
+
+  glEnable(GL_VERTEX_PROGRAM_ARB);
+  glEnable(GL_FRAGMENT_PROGRAM_ARB);
+
+  // warm up: iterations_ / 10 quads, each followed by glFlush() + glFinish()
+  for (unsigned int k = 0; k < (iterations_ / 10); k++) {
+    glBegin(GL_QUADS);
+    glTexCoord2f(0, 0);
+    glVertex2f(0, (GLfloat)height_);
+    glTexCoord2f((GLfloat)width_, 0);
+    glVertex2f((GLfloat)width_, (GLfloat)height_);
+    glTexCoord2f((GLfloat)width_, (GLfloat)height_);
+    glVertex2f((GLfloat)width_, 0);
+    glTexCoord2f(0, (GLfloat)height_);
+    glVertex2f(0, 0);
+    glEnd();
+    glFlush();
+    glFinish();
+  }
+
+  // actual test; timer_ is started at k == 1, after quad 0 has been issued
+  for (unsigned int k = 0; k < iterations_; k++) {
+    if (k == 1) {
+      timer_.Reset();
+      timer_.Start();
+    }
+
+    glBegin(GL_QUADS);
+    glTexCoord2f(0, 0);
+    glVertex2f(0, (GLfloat)height_);
+    glTexCoord2f((GLfloat)width_, 0);
+    glVertex2f((GLfloat)width_, (GLfloat)height_);
+    glTexCoord2f((GLfloat)width_, (GLfloat)height_);
+    glVertex2f((GLfloat)width_, 0);
+    glTexCoord2f(0, (GLfloat)height_);
+    glVertex2f(0, 0);
+    glEnd();
+  }
+
+  glFlush();
+  glFinish();
+
+  timer_.Stop();
+
+  glDisable(GL_VERTEX_PROGRAM_ARB);
+  glDisable(GL_FRAGMENT_PROGRAM_ARB);
+
+  // now let's read back the pixels
+  result_ = (cl_uchar *)malloc(width_ * height_ * 4);
+  // NOTE(review): result_ is not checked for NULL before glReadPixels().
+  glReadPixels(0, 0, width_, height_, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV,
+               result_);
+
+  // bind back default frame buffer
+  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
+
+  glDeleteFramebuffersEXT(1, &fbo);
+  glDeleteTextures(1, &srcTexture);
+  glDeleteProgramsARB(1, &vertexProgram);
+  glDeleteProgramsARB(1, &fragmentProgram);
+}
+
+void OCLPerfSepia::runCL(void) {
+  // OpenCL path: wraps a GL rectangle texture as the destination image,
+  // uploads data_ as the source image, runs the kernel iterations_ / 10 times
+  // to warm up, then runs the main loop (timer_ starts at k == 1) and reads
+  // the destination image back into result_.
+  // NOTE(review): CHECK_RESULT presumably returns on failure, in which case
+  // dst/src/nearestZero created before that point are not released.
+  cl_mem dst, src;
+  cl_sampler nearestZero;
+
+  glEnable(GL_TEXTURE_RECTANGLE_EXT);
+  glGenTextures(1, &texId);
+  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);
+  // XXX Alex: have GL alloc memory for us ...
+  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
+               GL_RGBA /*BGRA*/, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
+
+  dst = _wrapper->clCreateFromGLTexture2D(
+      context_, CL_MEM_READ_WRITE, GL_TEXTURE_RECTANGLE_EXT, 0, texId, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateFromGLTexture2D error (%d)",
+               error_);
+  nearestZero = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_CLAMP,
+                                          CL_FILTER_NEAREST, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler error (%d)", error_);
+  src = _wrapper->clCreateImage2D(
+      context_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format_, width_,
+      height_, bpr_, data_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage2D error (%d)", error_);
+
+  int flipped[] = {1};
+  int dims[] = {(int)width_, (int)height_, 0, 0};
+  float st_origin[] = {0, 0};
+  float st_delta[] = {1, 0, 0, 1};
+  float l0[] = {1.0f, 0.99f, 0.92f, 1.0f};
+  float l1[] = {0.5f, 0.0f, 0.0f, 0.0f};
+  float l2[] = {1.0f, 0.0f, 0.0f, 0.0f};
+  float l3[] = {0.0f, -1.0f, 0.0f, (float)height_};
+  // Kernel arguments in declaration order; every clSetKernelArg() status is
+  // checked instead of being dropped.
+  const struct {
+    size_t size;
+    const void *value;
+  } args[] = {
+      {sizeof(cl_mem), &dst},              // write_only image2d_t dest
+      {sizeof(int), flipped},              // int flipped
+      {4 * sizeof(int), dims},             // int4 dim
+      {2 * sizeof(float), st_origin},      // float2 st_origin
+      {4 * sizeof(float), st_delta},       // float4 st_delta
+      {4 * sizeof(float), l0},             // float4 l0
+      {4 * sizeof(float), l1},             // float4 l1
+      {4 * sizeof(float), l2},             // float4 l2
+      {4 * sizeof(float), l3},             // float4 l3
+      {sizeof(cl_mem), &src},              // read_only image2d_t t0
+      {sizeof(cl_sampler), &nearestZero},  // sampler_t t_sampler0
+  };
+  for (cl_uint i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, i, args[i].size, args[i].value);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg error (%d)", error_);
+  }
+
+  size_t execution_threads[2];
+  size_t execution_local[2];
+  cl_uint work_dim = 2;
+  error_ = _wrapper->clGetKernelWorkGroupInfo(
+      kernel_, devices_[_deviceId], CL_KERNEL_WORK_GROUP_SIZE,
+      sizeof(execution_local[0]), &execution_local[0], 0);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelWorkGroupInfo error (%d)",
+               error_);
+  execution_local[1] = 1;
+  GetKernelExecDimsForImage((unsigned int)execution_local[0], width_, height_,
+                            execution_threads, execution_local);
+  result_ = (cl_uchar *)malloc(height_ * bpr_);
+  CHECK_RESULT(result_ == NULL, "malloc failed");
+
+  const size_t origin[] = {0, 0, 0};
+  const size_t region[] = {width_, height_, 1};
+
+  // warm up
+  for (unsigned int k = 0; k < (iterations_ / 10); k++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_,
+                                              work_dim, NULL, execution_threads,
+                                              execution_local, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)",
+                 error_);
+    error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_);
+  }
+
+  // actual test
+  for (unsigned int k = 0; k < iterations_; k++) {
+    if (k == 1) {
+      timer_.Reset();
+      timer_.Start();
+    }
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_,
+                                              work_dim, NULL, execution_threads,
+                                              execution_local, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)",
+                 error_);
+  }
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_);
+
+  timer_.Stop();
+
+  error_ =
+      _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], dst, true, origin,
+                                   region, bpr_, 0, result_, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage error (%d)", error_);
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  _wrapper->clReleaseMemObject(src), src = NULL;
+  _wrapper->clReleaseSampler(nearestZero);
+  _wrapper->clReleaseMemObject(dst), dst = NULL;
+}
+
+void OCLPerfSepia::GetKernelExecDimsForImage(unsigned int work_group_size,
+                                             unsigned int w, unsigned int h,
+                                             size_t *global, size_t *local) {
+  // Chooses a local size of at most 16x16 work-items derived from
+  // work_group_size, then rounds the global size up to a whole number of
+  // work-groups covering w x h.
+  const unsigned int kTile = 16;
+  const unsigned int rows = work_group_size / kTile;
+  // local[1] ends up in [1, kTile]; local[0] is min(kTile, work_group_size).
+  local[0] = (work_group_size > kTile) ? kTile : work_group_size;
+  if (rows > kTile) {
+    local[1] = kTile;
+  } else {
+    local[1] = (rows > 1) ? rows : 1;
+  }
+
+  const unsigned int extent[2] = {w, h};
+  for (int d = 0; d < 2; ++d) {
+    const unsigned int step = (unsigned int)local[d];
+    const unsigned int groups = extent[d] / step + (extent[d] % step ? 1 : 0);
+    global[d] = groups;
+    global[d] *= local[d];
+  }
+}
+
+void OCLPerfSepia::run(void) {
+  // Sub-test 0 takes the OpenCL path, any other sub-test the OpenGL one; the
+  // elapsed time of timer_ is the result. No-op after an error/silent failure.
+  if (_errorFlag || silentFailure_) return;
+  const bool useCL = (_openTest == 0);
+  populateData();
+  if (useCL) {
+    runCL();
+  } else {
+    runGL();
+  }
+  if (bVerify_) verifyResult();
+  char description[100];
+  const char *api = useCL ? "CL" : "GL";
+  SNPRINTF(description, sizeof(description), "%s iterations# %d", api,
+           iterations_);
+  testDescString = description;
+  _perfInfo = (float)timer_.GetElapsedTime();
+}
+
+void OCLPerfSepia::verifyResult(void) {
+  // Per-channel byte sums of result_ vs. the expected totals (tolerance 20000).
+  static const int expected[4] = {267386880, 152797810, 125868080, 76147833};
+  int sums[4] = {0, 0, 0, 0};
+  const unsigned int total = height_ * bpr_;
+  for (unsigned int px = 0; px < total; px += 4) {
+    for (int c = 0; c < 4; ++c) sums[c] += result_[px + c];
+  }
+  int deviation = 0;
+  for (int c = 0; c < 4; ++c) deviation += abs(sums[c] - expected[c]);
+  CHECK_RESULT(deviation > 20000, "wrong result");
+}
+unsigned int OCLPerfSepia::close(void) {
+  // Frees the host buffers and the GL texture, then defers to
+  // OCLGLCommon::close(). Pointers/ids are zeroed after release so a repeated
+  // close() cannot free or delete them twice.
+  if (silentFailure_) {
+    return 0;
+  }
+
+  free(data_);  // free(NULL) is a no-op, no guard needed
+  data_ = 0;
+  free(result_);
+  result_ = 0;
+  if (texId) {
+    glDeleteTextures(1, &texId);
+    texId = 0;
+  }
+
+  return OCLGLCommon::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h
new file mode 100644
index 0000000000..0103060009
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_SEPIA_H_
+#define _OCL_PERF_SEPIA_H_
+
+#include "OCLGLCommon.h"
+#include "Timer.h"
+
+class OCLPerfSepia : public OCLGLCommon {
+ public:
+  OCLPerfSepia();
+  virtual ~OCLPerfSepia();
+
+  virtual void open(unsigned int test, char *units, double &conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  void runGL(void);
+  void runCL(void);
+  void populateData(void);
+  void verifyResult(void);
+  void GetKernelExecDimsForImage(unsigned int work_group_size, unsigned int w,
+                                 unsigned int h, size_t *global, size_t *local);
+
+  bool silentFailure_;
+  cl_uint iterations_;
+  cl_image_format format_;
+  cl_uchar *data_;
+  cl_uchar *result_;
+  bool bVerify_;
+  cl_uint width_;
+  cl_uint height_;
+  cl_uint bpr_;
+  GLuint texId;
+  CPerfCounter timer_;
+};
+
+#endif  // _OCL_PERF_SEPIA_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.cpp
new file mode 100644
index 0000000000..d4e80ba044
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.cpp
@@ -0,0 +1,409 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfTextureMemLatency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 13;
+// 2k up to 64MB
+static const cl_uint2 Dims[NUM_SIZES] = {
+    {{32, 16}},    {{32, 32}},     {{64, 32}},    {{64, 64}},   {{128, 64}},
+    {{128, 128}},  {{256, 128}},   {{256, 256}},  {{512, 256}}, {{512, 512}},
+    {{1024, 512}}, {{1024, 1024}}, {{2048, 1024}}};
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfTextureMemLatency::genShader() {
+  shader_.clear();
+
+  // Adopted from SiSoft Sandra 2013's memory latency test
+  shader_ +=
+      "constant sampler_t insample = CLK_NORMALIZED_COORDS_FALSE | "
+      "CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
+      "__kernel\n"
+      "__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void MemWalker(\n"
+      "    read_only image2d_t input,\n"
+      "    __global uint * restrict output,\n"
+      "    const uint uCount,  const uint uSize,\n"
+      "    const uint4 uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "    uint4 o = uOffset;\n"
+      "    uint lid = get_local_id(0);\n"
+      "    uint4 x = lid*o;\n"
+      "\n"
+      "    for (uint loop = 0; (loop < repeats); loop++) {\n"
+      "        uint i = uCount;\n"
+      "        int2 nx = (int2)(0,0);\n"
+      "        nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n"
+      "        while (i--) {\n"
+      "            x = read_imageui(input, insample, nx);\n"
+      "            x.x += o.x;\n"
+      "            x.z += o.z;\n"
+      "            nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n"
+      "        }\n"
+      "    }\n"
+      "\n"
+      "    output[0] = x.x + x.y;\n"
+      "}\n";
+
+  // printf("shader:\n%s\n", shader_.c_str());
+  shader_ += "\n\n";
+  shader_ +=
+      "__kernel\n"
+      "__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void Overhead(\n"
+      "    read_only image2d_t input,\n"
+      "    __global uint * restrict output,\n"
+      "    const uint uCount,  const uint uSize,\n"
+      "    const uint4 uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "    uint4 o = uOffset;\n"
+      "    uint lid = get_local_id(0);\n"
+      "    uint4 x = lid*o;\n"
+      "    x += o;\n"
+      "    int2 nx;\n"
+      "    for (uint loop = 0; loop < repeats; loop++) {\n"
+      "        uint i = uCount;\n"
+      "        nx = (int2)(0,0);\n"
+      "        nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n"
+      "        while (i--) {\n"
+      "            x.x = nx.x  + o.x;\n"
+      "            x.z = nx.y  + o.y;\n"
+      "            nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n"
+      "        }\n"
+      "    }\n"
+      "    output[0] = nx.x | nx.y;\n"
+      "}\n";
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+OCLPerfTextureMemLatency::OCLPerfTextureMemLatency() {
+  _numSubTests = NUM_SIZES;  // one subtest per entry in Dims
+  maxSize_ = Dims[NUM_SIZES - 1].s[0] * Dims[NUM_SIZES - 1].s[1];  // last entry
+}
+
+OCLPerfTextureMemLatency::~OCLPerfTextureMemLatency() {}
+
+// Fills the image with a chain of coordinates: each texel stores the (x, y)
+// of the next texel to visit, packed as (y << 16) | x. `val` is unused.
+void OCLPerfTextureMemLatency::setData(cl_mem buffer, unsigned int val) {
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width_, height_, 1};
+  void *ptr = _wrapper->clEnqueueMapImage(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, origin, region, &image_row_pitch,
+      &image_slice_pitch, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapImage failed.");
+  unsigned int *data = (unsigned int *)ptr;
+  unsigned int nextOffset = 0;
+  for (unsigned int i = 0; i < bufSizeDW_; i++) {
+    // A stride of 1041 texels (mod image size) selects the next chain link.
+    const unsigned int offset = ((1024 + 17) * (i + 1)) % bufSizeDW_;
+    const unsigned int x = offset % width_, y = offset / width_;
+    const unsigned int newx = nextOffset % width_;
+    const unsigned int newy = nextOffset / width_;
+    data[newy * image_row_pitch / sizeof(unsigned int) + newx] =
+        (y << 16) | (x & 0xffff);
+    nextOffset = offset;
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  CHECK_RESULT_NO_RETURN(error_, "clEnqueueUnmapMemObject failed.");
+  _wrapper->clFinish(cmd_queue_);  // go through the wrapper like other calls
+}
+
+// Maps the one-uint output buffer (bailing out if the map fails) and flags it.
+void OCLPerfTextureMemLatency::checkData(cl_mem buffer) {
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0,
+                                   sizeof(cl_uint), 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS || ptr == NULL,
+               "clEnqueueMapBuffer failed");
+  unsigned int *data = (unsigned int *)ptr;
+  if (data[0] != 0) printf("OutData= 0x%08x\n", data[0]);
+  CHECK_RESULT_NO_RETURN(data[0] != 0, "Data validation failed!\n");
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+// Prepares one subtest: selects the image size from Dims[test % NUM_SIZES],
+// creates the context, queue, input image and output buffer, builds the
+// MemWalker and Overhead kernels, binds their arguments and fills the image.
+// `units` is not written here; `conversion` is set to 1.
+void OCLPerfTextureMemLatency::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  // Clear every handle that close() tests, kernel2_ included, so an early
+  // exit below never leaves close() looking at an indeterminate value.
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  kernel2_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  _errorFlag = false;  // Reset error code so a single error doesn't prevent
+                       // other subtests from running
+  _errorMsg = "";
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+    // The vendor string in pbuf is fetched but plays no part in selection.
+    delete[] platforms;  // allocated with new[]
+  }
+
+  width_ = Dims[test % NUM_SIZES].s[0];
+  height_ = Dims[test % NUM_SIZES].s[1];
+  bufSizeDW_ = width_ * height_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device; release the list before any early return */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) device = devices[0];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8};
+  inBuffer_ = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY, &format,
+                                        width_, height_, 0, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateImage(inBuffer) failed");
+
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, 0, sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  args.clear();
+
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    cl_int intError;
+    char log[16384] = "";  // stays empty if the log query itself fails
+    intError =
+        _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                        16384 * sizeof(char), log, NULL);
+    printf("Build error -> %s\n", log);
+    // The condition has to be true for the check to flag the failure.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "MemWalker", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel(MemWalker) failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "Overhead", &error_);
+  CHECK_RESULT(kernel2_ == 0, "clCreateKernel(Overhead) failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  cl_uint4 zero;
+  zero.s[0] = 0;
+  zero.s[1] = 0;
+  zero.s[2] = 0;
+  zero.s[3] = 0;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint4), (void *)&zero);
+  int bMem = 1;
+  error_ = _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_int), (void *)&bMem);
+  repeats_ = std::max((maxSize_ >> 2) / bufSizeDW_, 1u);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&repeats_);
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 1, sizeof(cl_mem),
+                                    (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 3, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 4, sizeof(cl_uint4), (void *)&zero);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 5, sizeof(cl_int), (void *)&bMem);
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 6, sizeof(cl_uint), (void *)&repeats_);
+
+  setData(inBuffer_, (int)1.0f);
+}
+
+// Times the MemWalker kernel, subtracts the time of the Overhead kernel and
+// reports the remainder per read (bufSizeDW_ * repeats_ reads) in ns.
+void OCLPerfTextureMemLatency::run(void) {
+  // Both kernels are launched as a single work-item.
+  const size_t globalSize[1] = {1};
+  const size_t localSize[1] = {1};
+
+  // Warm-up: run MemWalker with a count of 128, then restore the real count.
+  unsigned int warmupCount = 128;
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&warmupCount);
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                            globalSize, localSize, 0, NULL,
+                                            NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  _wrapper->clFinish(cmd_queue_);
+
+  CPerfCounter walkTimer, overheadTimer;
+
+  // Timed pass 1: MemWalker, which follows the chain stored in the image.
+  walkTimer.Reset();
+  walkTimer.Start();
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                            globalSize, localSize, 0, NULL,
+                                            NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  walkTimer.Stop();
+
+  checkData(outBuffer_);
+
+  // Timed pass 2: Overhead, the same loop without the image reads.
+  overheadTimer.Reset();
+  overheadTimer.Start();
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel2_, 1, NULL,
+                                            globalSize, localSize, 0, NULL,
+                                            NULL);
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  overheadTimer.Stop();
+
+  // Net time attributed to the reads, converted to ns per read.
+  const double netSeconds =
+      walkTimer.GetElapsedTime() - overheadTimer.GetElapsedTime();
+  const double nsPerRead =
+      netSeconds * (double)(1e09) / ((double)bufSizeDW_ * (double)repeats_);
+  _perfInfo = (float)nsPerRead;
+
+  char desc[256];
+  SNPRINTF(desc, sizeof(desc), "%8d reads, %5d repeats (ns)", bufSizeDW_,
+           repeats_);
+  testDescString = desc;
+}
+
+unsigned int OCLPerfTextureMemLatency::close(void) {
+  _wrapper->clFinish(cmd_queue_);  // drain the queue before releasing objects
+  // Release every object open() created; handles that are still 0 are skipped.
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // Hand back _crcword, which open() resets to 0.
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h
new file mode 100644
index 0000000000..31a1197286
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_TEXTUREMEMLATENCY_H_
+#define _OCL_TEXTUREMEMLATENCY_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfTextureMemLatency : public OCLTestImp {  // image read latency
+ public:
+  OCLPerfTextureMemLatency();
+  virtual ~OCLPerfTextureMemLatency();
+  // Test entry points: open() builds one subtest, run() times it, close() frees.
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Helpers used by open() and run().
+  std::string shader_;  // OpenCL C source assembled by genShader()
+  void genShader(void);
+  void setData(cl_mem buffer, unsigned int data);  // `data` is not read
+  void checkData(cl_mem buffer);
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;   // "MemWalker"
+  cl_kernel kernel2_;  // "Overhead"
+  cl_mem inBuffer_;    // 2D CL_RGBA / CL_UNSIGNED_INT8 image
+  cl_mem outBuffer_;   // buffer of sizeof(cl_uint) bytes
+  cl_int error_;       // status of the most recent OpenCL call
+  // Geometry of the current subtest and derived counts.
+  unsigned int width_;
+  unsigned int height_;
+  size_t image_row_pitch;    // written by clEnqueueMapImage in setData()
+  size_t image_slice_pitch;  // written by clEnqueueMapImage in setData()
+  unsigned int bufSizeDW_;   // width_ * height_
+  unsigned int repeats_;     // max((maxSize_ >> 2) / bufSizeDW_, 1)
+  unsigned int maxSize_;     // width * height of the last Dims entry
+};
+
+#endif  // _OCL_TEXTUREMEMLATENCY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.cpp
new file mode 100644
index 0000000000..2837dafa81
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.cpp
@@ -0,0 +1,630 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfUAVReadSpeed.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 4;
+static const unsigned int NUM_READ_MODES = 6;
+// Limit to 32 reads for now
+static const unsigned int MAX_READ_MODES = 4;
+
+static const unsigned int NumReads[NUM_READ_MODES] = {1, 4, 16, 32, 64, 128};
+// 256KB, 1 MB, 4MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+static const unsigned int MaxTypes = 6;
+static unsigned int NumTypes = MaxTypes;
+static const char *types[MaxTypes] = {"char", "short", "int",
+                                      "long", "float", "double"};
+static unsigned int StartType = 0;
+static const unsigned int NumVecWidths = 5;
+static const char *vecWidths[NumVecWidths] = {"", "2", "4", "8", "16"};
+static const unsigned int TypeSize[MaxTypes] = {
+    sizeof(cl_char), sizeof(cl_short), sizeof(cl_int),
+    sizeof(cl_long), sizeof(cl_float), sizeof(cl_double)};
+#define CHAR_BUF_SIZE 512
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfUAVReadSpeed::genShader(unsigned int type, unsigned int vecWidth,
+                                    unsigned int numReads) {
+  char buf[CHAR_BUF_SIZE];
+
+  shader_.clear();
+  shader_ +=
+      "#ifdef USE_ARENA\n"
+      "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n"
+      "#endif\n";
+  shader_ +=
+      "#ifdef USE_AMD_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+      "#endif\n";
+  shader_ +=
+      "#ifdef USE_KHR_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      "#endif\n";
+  SNPRINTF(buf, CHAR_BUF_SIZE,
+           "__kernel void __attribute__((reqd_work_group_size(64,1,1))) "
+           "_uavReadSpeed(__global %s%s * restrict inBuf, __global %s%s * "
+           "restrict outBuf, constant uint * restrict constBuf)\n",
+           types[type], vecWidths[vecWidth], types[type], vecWidths[vecWidth]);
+  shader_.append(buf);
+  shader_ +=
+      "{\n"
+      "    uint i = (uint) get_global_id(0);\n";
+  if (numReads == 1) {
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    shader_ +=
+        "    const unsigned int Max = constBuf[0];\n"
+        "    temp = *(inBuf + i % Max);\n";
+    shader_ +=
+        "    *(outBuf + i) = temp;\n"
+        "}\n";
+  } else {
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp0 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp1 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp2 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    SNPRINTF(buf, CHAR_BUF_SIZE, "    %s%s temp3 = 0;\n", types[type],
+             vecWidths[vecWidth]);
+    shader_.append(buf);
+    shader_ +=
+        "    const unsigned int Max = constBuf[0];\n"
+        "    unsigned int idx0 = (i % Max) + constBuf[1];\n"
+        "    unsigned int idx1 = (i % Max) + constBuf[2];\n"
+        "    unsigned int idx2 = (i % Max) + constBuf[3];\n"
+        "    unsigned int idx3 = (i % Max) + constBuf[4];\n";
+
+    for (unsigned int i = 0; i < (numReads >> 2); i++) {
+      shader_ += "    temp0 += *(inBuf + idx0);\n";
+      shader_ += "    temp1 += *(inBuf + idx1);\n";
+      shader_ += "    temp2 += *(inBuf + idx2);\n";
+      shader_ += "    temp3 += *(inBuf + idx3);\n";
+      shader_ += "    idx0 += constBuf[5];\n";
+      shader_ += "    idx1 += constBuf[5];\n";
+      shader_ += "    idx2 += constBuf[5];\n";
+      shader_ += "    idx3 += constBuf[5];\n";
+    }
+    shader_ +=
+        "    *(outBuf + i) = temp0 + temp1 + temp2 + temp3;\n"
+        "}\n";
+  }
+  // printf("shader:\n%s\n", shader_.c_str());
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Queries the device's extension string to size the subtest matrix (NumTypes,
+// StartType, _numSubTests); the temporary context is released before return.
+OCLPerfUAVReadSpeed::OCLPerfUAVReadSpeed() {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  context_ = 0;
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // Get last for default
+    platform = platforms[numPlatforms - 1];
+    for (unsigned i = 0; i < numPlatforms; ++i) {
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices
+      // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+      // Choose platform with GPU devices
+      if (num_devices > 0) {
+        platform = platforms[i];
+        break;
+      }
+    }
+    delete[] platforms;  // allocated with new[]
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Get the requested device, then drop the list before any early return */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices)
+    device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  const cl_int infoError = _wrapper->clGetDeviceInfo(
+      device, CL_DEVICE_EXTENSIONS, 1024, charbuf, &retsize);
+  // Done with the context: release it before the early return below.
+  error_ = _wrapper->clReleaseContext(context_);
+  context_ = 0;
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  error_ = infoError;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  char *p = strstr(charbuf, "cl_khr_byte_addressable_store");
+  char *p2 = strstr(charbuf, "cl_khr_fp64");
+  char *p3 = strstr(charbuf, "cl_amd_fp64");
+  NumTypes = MaxTypes;
+  if (!p) {
+    // No arena ops
+    NumTypes -= 2;
+    StartType = 2;
+  }
+  if (!p2 && !p3) {
+    // Doubles not supported
+    NumTypes--;
+  }
+  _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES * 2;
+}
+
+OCLPerfUAVReadSpeed::~OCLPerfUAVReadSpeed() {}
+
+// Writes `val`, converted to T, into every whole T element of the first
+// `bytes` bytes at `ptr`.
+template <typename T>
+static void fillMapped(void *ptr, size_t bytes, float val) {
+  T *data = (T *)ptr;
+  const size_t count = bytes / sizeof(T);
+  for (size_t i = 0; i < count; i++) {
+    data[i] = (T)val;
+  }
+}
+
+// Fills `buffer` (bufSize_ bytes) with `val` converted to the element type
+// selected by typeIdx_; the case labels follow the order of types[].
+// An unknown typeIdx_ leaves the contents untouched. Map and unmap failures
+// are reported through the CHECK_RESULT macros.
+void OCLPerfUAVReadSpeed::setData(cl_mem buffer, float val) {
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_WRITE, 0,
+                                   bufSize_, 0, NULL, NULL, &error_);
+  // Writing through a failed mapping would dereference an invalid pointer.
+  CHECK_RESULT(error_ != CL_SUCCESS || ptr == NULL,
+               "clEnqueueMapBuffer failed");
+
+  // One fill per supported element type; all of them share fillMapped<T>().
+  switch (typeIdx_) {
+    case 0:  // char
+      fillMapped<char>(ptr, bufSize_, val);
+      break;
+    case 1:  // short
+      fillMapped<short>(ptr, bufSize_, val);
+      break;
+    case 2:  // int
+      fillMapped<int>(ptr, bufSize_, val);
+      break;
+    case 3:  // long
+      fillMapped<cl_long>(ptr, bufSize_, val);
+      break;
+    case 4:  // float
+      fillMapped<float>(ptr, bufSize_, val);
+      break;
+    case 5:  // double
+      fillMapped<double>(ptr, bufSize_, val);
+      break;
+    default:
+      // Unknown type index: nothing is written.
+      break;
+  }
+
+  // The mapping is released on every path that reached the switch.
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  // Report a failed unmap without returning early.
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                         "clEnqueueUnmapMemObject failed");
+}
+
+void OCLPerfUAVReadSpeed::checkData(cl_mem buffer) {
+  // Maps `buffer` for reading and compares every element, interpreted as the
+  // type selected by typeIdx_, against numReads_.  The first mismatch is
+  // printed and reported through CHECK_RESULT_NO_RETURN; the buffer is
+  // unmapped before returning.
+  //
+  // Only the offending element is printed.  Dumping its neighbours as well
+  // would read past the end of the mapping when the mismatch is in the last
+  // few elements, and pushing every type through unsigned int misreports
+  // floating-point and 64-bit values.
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0,
+                                   bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(ptr == NULL, "clEnqueueMapBuffer failed");
+  bool mismatch = false;
+  switch (typeIdx_) {
+    case 0:  // char
+    {
+      char *data = (char *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(char)); i++) {
+        if (data[i] != (char)numReads_) {
+          printf("Data validation failed at index %u!\n", i);
+          printf("Expected %u\nGot %d\n", numReads_, (int)data[i]);
+          mismatch = true;
+          break;
+        }
+      }
+      break;
+    }
+    case 1:  // short
+    {
+      short *data = (short *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(short)); i++) {
+        if (data[i] != (short)numReads_) {
+          printf("Data validation failed at index %u!\n", i);
+          printf("Expected %u\nGot %d\n", numReads_, (int)data[i]);
+          mismatch = true;
+          break;
+        }
+      }
+      break;
+    }
+    case 2:  // int
+    {
+      int *data = (int *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(int)); i++) {
+        if (data[i] != (int)numReads_) {
+          printf("Data validation failed at index %u!\n", i);
+          printf("Expected %u\nGot %d\n", numReads_, data[i]);
+          mismatch = true;
+          break;
+        }
+      }
+      break;
+    }
+    case 3:  // long
+    {
+      cl_long *data = (cl_long *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_long)); i++) {
+        if (data[i] != (cl_long)numReads_) {
+          printf("Data validation failed at index %u!\n", i);
+          printf("Expected %u\nGot %lld\n", numReads_, (long long)data[i]);
+          mismatch = true;
+          break;
+        }
+      }
+      break;
+    }
+    case 4:  // float
+    {
+      float *data = (float *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(float)); i++) {
+        if (data[i] != (float)numReads_) {
+          printf("Data validation failed at index %u!\n", i);
+          printf("Expected %u\nGot %g\n", numReads_, data[i]);
+          mismatch = true;
+          break;
+        }
+      }
+      break;
+    }
+    case 5:  // double
+    {
+      double *data = (double *)ptr;
+      for (unsigned int i = 0; i < (bufSize_ / sizeof(double)); i++) {
+        if (data[i] != (double)numReads_) {
+          printf("Data validation failed at index %u!\n", i);
+          printf("Expected %u\nGot %g\n", numReads_, data[i]);
+          mismatch = true;
+          break;
+        }
+      }
+      break;
+    }
+    default:
+      // oops
+      break;
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                         "clEnqueueUnmapMemObject failed");
+
+  // The first macro argument is the failure condition (as in its other uses
+  // in this file), so pass the scan result; a constant 0 would print the
+  // mismatch without ever recording it.
+  CHECK_RESULT_NO_RETURN(mismatch, "Data validation failed!\n");
+}
+
+void OCLPerfUAVReadSpeed::open(unsigned int test, char *units,
+                               double &conversion, unsigned int deviceId) {
+  // Sets up subtest `test` on device `deviceId` of platform _platformIndex:
+  // decodes `test` into read count, buffer size, vector width, element type
+  // and cached mode, then creates the context, queue, buffers and kernel and
+  // fills the buffers.  `units` is not touched; `conversion` is set to 1.
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  constBuffer_ = 0;
+  isAMD = false;
+  _errorFlag = false;  // Reset error code so a single error doesn't prevent
+                       // other subtests from running
+  _errorMsg = "";
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  // An out-of-range _platformIndex leaves `platform` NULL (reported below).
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    // Only the selected handle is needed from here on, so free the array
+    // (allocated with new[], hence delete[]) before any check can return.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    char pbuf[100] = "";  // stays empty if the vendor query fails
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    if (num_devices > 0) {
+      if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+  }
+
+  numReads_ = NumReads[test % MAX_READ_MODES];
+  width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES];
+  vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths;
+  typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes +
+             StartType;
+  cached_ = (test >= (MAX_READ_MODES * NUM_SIZES * NumTypes * NumVecWidths));
+
+  bufSize_ = width_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  // Handle copied out: free the list before a check below can return.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_);
+  CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed");
+
+  genShader(typeIdx_, vecSizeIdx_, numReads_);
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  if (cached_ && isAMD) {
+    args = "-fno-alias ";
+  }
+  if (typeIdx_ < 2) {
+    args += "-D USE_ARENA ";
+  }
+
+  if (typeIdx_ == 5) {
+    if (isAMD) {
+      args += "-D USE_AMD_DOUBLES ";
+    } else {
+      args += "-D USE_KHR_DOUBLES ";
+    }
+  }
+#if 0
+    // This setting can dramatically boost the long16 perf results by avoiding spilling.
+    if (isAMD)
+        args += "-Wb,-pre-RA-sched=list-tdrr";
+#endif
+
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // printed as-is even if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // First argument is the failure condition, so it must be true here.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_uavReadSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem),
+                                    (void *)&constBuffer_);
+
+  setData(inBuffer_, 1.0f);
+  setData(outBuffer_, 1.2345678f);
+  unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(cBuf == NULL, "clEnqueueMapBuffer(constBuffer) failed");
+  // Force all wavefronts to fetch the same data.  We are looking for peak speed
+  // here.
+  cBuf[0] = 64;
+  // These values are chosen to assure there is no data reuse within a clause.
+  // If caching is not working, then the uncached numbers will be low.
+  cBuf[1] = 0;
+  cBuf[2] = 64;
+  cBuf[3] = 128;
+  cBuf[4] = 192;
+  cBuf[5] = 0;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfUAVReadSpeed::run(void) {
+  // Times NUM_ITER launches of the kernel over the whole buffer, stores the
+  // resulting bandwidth in _perfInfo, builds the result label in
+  // testDescString and finally checks the output buffer.  One work-item is
+  // launched per (vector) element, in groups of 64.
+  const int workItems = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_));
+  const int groupSize = 64;
+  const size_t globalSize[1] = {(size_t)workItems};
+  const size_t localSize[1] = {(size_t)groupSize};
+
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int launch = 0; launch < NUM_ITER; ++launch) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              globalSize, localSize, 0, NULL,
+                                              NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+  stopwatch.Stop();
+
+  // Bytes read across all launches, scaled to GB, per elapsed second.
+  const double elapsed = stopwatch.GetElapsedTime();
+  _perfInfo = (float)(
+      ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / elapsed);
+
+  char typeName[256];
+  char label[256];
+  SNPRINTF(typeName, sizeof(typeName), "%s%s", types[typeIdx_],
+           vecWidths[vecSizeIdx_]);
+  SNPRINTF(label, sizeof(label), " %-8s (%8d) %2d reads: %-8s (GB/s) ",
+           typeName, width_, numReads_, (cached_ ? "cached" : "uncached"));
+  testDescString = label;
+  checkData(outBuffer_);
+}
+
+unsigned int OCLPerfUAVReadSpeed::close(void) {
+  _wrapper->clFinish(cmd_queue_);
+  // Release what open() created; handles it left at zero are skipped.
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (constBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(constBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(constBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // _crcword is reset to 0 at the start of open().
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h
new file mode 100644
index 0000000000..b779e7d6e8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UAVReadSpeed_H_
+#define _OCL_UAVReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfUAVReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfUAVReadSpeed();
+  virtual ~OCLPerfUAVReadSpeed();
+  // Subtest lifecycle: open() sets up, run() measures, close() tears down.
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Kernel source text, rebuilt by open() through genShader().
+  std::string shader_;
+  void genShader(unsigned int type, unsigned int vecWidth,
+                 unsigned int numReads);
+  void setData(cl_mem buffer, float data);
+  void checkData(cl_mem buffer);
+  // Number of kernel launches timed by run().
+  static const unsigned int NUM_ITER = 100;
+  // OpenCL objects created by open() and released by close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem inBuffer_;
+  cl_mem outBuffer_;
+  cl_mem constBuffer_;
+  cl_int error_;  // status of the most recent CL call
+  // Parameters of the current subtest, decoded from its index in open().
+  unsigned int width_;       // entry of Sizes[] chosen for the subtest
+  unsigned int bufSize_;     // buffer size in bytes (set to width_)
+  unsigned int vecSizeIdx_;  // index into vecWidths[]
+  unsigned int numReads_;    // entry of NumReads[]; value checkData() expects
+  unsigned int typeIdx_;     // index into types[] / TypeSize[]
+  bool cached_;              // subtest index lies in the "cached" range
+  bool isAMD;                // platform vendor string is AMD's
+};
+
+#endif  // _OCL_UAVReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.cpp
new file mode 100644
index 0000000000..24f736ac3c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.cpp
@@ -0,0 +1,437 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfUAVReadSpeedHostMem.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+const unsigned int NUM_SIZES = 4;
+const unsigned int NUM_READ_MODES = 1;
+const unsigned int MAX_READ_MODES = 1;
+
+static const unsigned int NumReads[NUM_READ_MODES] = {1};
+// Buffer sizes in bytes: 256 KB, 1 MB, 4 MB and 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+static const unsigned int MaxTypes = 2;
+static unsigned int NumTypes = MaxTypes;  // reduced by the ctor without fp64
+static const char *types[MaxTypes] = {"float", "double"};
+static const unsigned int TypeSize[MaxTypes] = {sizeof(cl_float),
+                                                sizeof(cl_double)};
+static const unsigned int NumVecWidths = 5;
+static const char *vecWidths[NumVecWidths] = {"", "2", "4", "8", "16"};
+#define CHAR_BUF_SIZE 512
+
+// Quiet pesky warnings: WIN_OS builds format through sprintf_s
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfUAVReadSpeedHostMem::genShader(unsigned int type,
+                                           unsigned int vecWidth,
+                                           unsigned int numReads) {
+  // Regenerates shader_: a kernel in which each work-item loads one element
+  // of inBuf and stores it to outBuf only if it is negative (lane 0 for
+  // vector types).  `type` and `vecWidth` index types[] and vecWidths[];
+  // numReads is unused.
+  char line[CHAR_BUF_SIZE];
+  const char *elem = types[type];
+  const char *lanes = vecWidths[vecWidth];
+
+  // Optional fp64 pragmas, selected by the build options passed in open().
+  shader_ =
+      "#ifdef USE_AMD_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+      "#endif\n"
+      "#ifdef USE_KHR_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      "#endif\n";
+
+  SNPRINTF(line, CHAR_BUF_SIZE,
+           "__kernel void _uavReadSpeedHostMem(__global %s%s *inBuf, __global "
+           "%s%s *outBuf, constant uint *constBuf)\n",
+           elem, lanes, elem, lanes);
+  shader_ += line;
+  shader_ +=
+      "{\n"
+      "    int i = (int) get_global_id(0);\n";
+
+  SNPRINTF(line, CHAR_BUF_SIZE, "    %s%s temp = 0;\n", elem, lanes);
+  shader_ += line;
+  shader_ += "    temp = *(inBuf + i);\n";
+
+  // Scalars are compared directly, vectors through their first component.
+  shader_ += (vecWidth == 0) ? "    if (temp < 0)\n" : "    if (temp.s0 < 0)\n";
+  shader_ +=
+      "        *(outBuf + i) = temp;\n"
+      "}\n";
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // args ignored
+
+OCLPerfUAVReadSpeedHostMem::OCLPerfUAVReadSpeedHostMem() {
+  // Sizes the subtest list: the double subtests are dropped when device
+  // _deviceId advertises neither cl_khr_fp64 nor cl_amd_fp64.
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  context_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Keep the outcome: the scan overwrites error_, the array is freed first.
+    const bool listed = (error_ == CL_SUCCESS);
+    // Get last for default
+    platform = listed ? platforms[numPlatforms - 1] : NULL;
+    for (unsigned i = 0; listed && i < numPlatforms; ++i) {
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
+      // Not checked: the runtime returns an error when no GPU devices are
+      // present instead of just returning 0 devices
+      // Choose platform with GPU devices
+      if (num_devices > 0) {
+        platform = platforms[i];
+        break;
+      }
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(!listed, "clGetPlatformIDs failed");
+  }
+
+  // Without a platform there is nothing to probe.
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);  // handle copied out; free before a check below can return
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+  // Not needed by the query below; release now so no early return leaks it.
+  error_ = _wrapper->clReleaseContext(context_);
+  context_ = 0;
+  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  char *p = strstr(charbuf, "cl_khr_fp64");
+  char *p2 = strstr(charbuf, "cl_amd_fp64");
+  NumTypes = MaxTypes;
+  if (!p && !p2) {
+    // Doubles not supported
+    NumTypes--;
+  }
+  _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES;
+}
+
+OCLPerfUAVReadSpeedHostMem::~OCLPerfUAVReadSpeedHostMem() {}  // empty body
+
+// setData()/checkData(): both return early if the buffer cannot be mapped.
+void OCLPerfUAVReadSpeedHostMem::setData(cl_mem buffer, float val) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_WRITE, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+void OCLPerfUAVReadSpeedHostMem::checkData(cl_mem buffer) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_READ, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) {
+    if (data[i] != (float)numReads_) {
+      printf("Data validation failed at index %u!\n", i);
+      printf("Expected %u\nGot %g\n", numReads_, data[i]);  // in-bounds only
+      CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+      break;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+void OCLPerfUAVReadSpeedHostMem::open(unsigned int test, char *units,
+                                      double &conversion,
+                                      unsigned int deviceId) {
+  // Sets up subtest `test` on device `deviceId` of platform _platformIndex:
+  // decodes `test` into buffer size, vector width and element type, then
+  // creates the context, queue, buffers and kernel and fills the buffers.
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  constBuffer_ = 0;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  // An out-of-range _platformIndex leaves `platform` NULL (reported below).
+  if (_platformIndex < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+    }
+    // Only the selected handle is needed from here on, so free the array
+    // (allocated with new[], hence delete[]) before any check can return.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    char pbuf[100] = "";  // stays empty if the vendor query fails
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    if (num_devices > 0) {
+      if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+  }
+
+  numReads_ = NumReads[test % MAX_READ_MODES];
+  width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES];
+  vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths;
+  typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes;
+  cached_ = true;
+
+  bufSize_ = width_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  // Handle copied out: free the list before a check below can return.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_,
+                                       CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                       bufSize_, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_);
+  CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed");
+
+  genShader(typeIdx_, vecSizeIdx_, numReads_);
+  char *tmp = (char *)shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&tmp, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  if (cached_ && isAMD) {
+    args = "-fno-alias ";
+  }
+  if (typeIdx_ == 1) {
+    if (isAMD) {
+      args += "-D USE_AMD_DOUBLES ";
+    } else {
+      args += "-D USE_KHR_DOUBLES ";
+    }
+  }
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";  // printed as-is even if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // First argument is the failure condition, so it must be true here.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_uavReadSpeedHostMem", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem),
+                                    (void *)&constBuffer_);
+
+  setData(inBuffer_, 0.0f);
+  setData(outBuffer_, 1.2345678f);
+  unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(cBuf == NULL, "clEnqueueMapBuffer(constBuffer) failed");
+  // cBuf[0] is the number of work-items that run() launches.
+  cBuf[0] = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_));
+  cBuf[1] = 0;
+  cBuf[2] = 1024;
+  cBuf[3] = 2048;
+  cBuf[4] = 3072;
+  cBuf[5] = 0;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfUAVReadSpeedHostMem::run(void) {
+  // Times NUM_ITER launches of the kernel over the whole buffer, stores the
+  // resulting bandwidth in _perfInfo and builds the result label in
+  // testDescString.  One work-item is launched per (vector) element, in
+  // groups of 64.
+  const int workItems = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_));
+  const int groupSize = 64;
+  const size_t globalSize[1] = {(size_t)workItems};
+  const size_t localSize[1] = {(size_t)groupSize};
+
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int launch = 0; launch < NUM_ITER; ++launch) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              globalSize, localSize, 0, NULL,
+                                              NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+  stopwatch.Stop();
+
+  // Bytes read across all launches, scaled to GB, per elapsed second.
+  const double elapsed = stopwatch.GetElapsedTime();
+  _perfInfo = (float)(
+      ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / elapsed);
+
+  char typeName[256];
+  char label[256];
+  SNPRINTF(typeName, sizeof(typeName), "%s%s", types[typeIdx_],
+           vecWidths[vecSizeIdx_]);
+  SNPRINTF(label, sizeof(label), " %-8s (%8d) (GB/s) ", typeName, width_);
+  testDescString = label;
+  // Test doesn't write anything
+  // checkData(outBuffer_);
+}
+
+unsigned int OCLPerfUAVReadSpeedHostMem::close(void) {
+  _wrapper->clFinish(cmd_queue_);
+  // Release what open() created; handles it left at zero are skipped.
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (constBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(constBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(constBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // _crcword is reset to 0 at the start of open().
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h
new file mode 100644
index 0000000000..20f2393313
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UAVReadSpeedHostMem_H_
+#define _OCL_UAVReadSpeedHostMem_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfUAVReadSpeedHostMem : public OCLTestImp {
+ public:
+  OCLPerfUAVReadSpeedHostMem();
+  virtual ~OCLPerfUAVReadSpeedHostMem();
+  // Test entry points overridden from OCLTestImp.
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Kernel source text plus the helpers declared alongside it.
+  std::string shader_;
+  void genShader(unsigned int type, unsigned int vecWidth,
+                 unsigned int numReads);
+  void setData(cl_mem buffer, float data);
+  void checkData(cl_mem buffer);
+
+  static const unsigned int NUM_ITER = 100;
+  // OpenCL handles; close() releases each one that is non-zero.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem inBuffer_;
+  cl_mem outBuffer_;
+  cl_mem constBuffer_;
+  cl_int error_;  // result of the most recent OpenCL call
+  // Subtest parameters; run() prints width_ and the type/width names.
+  unsigned int width_;
+  unsigned int bufSize_;
+  unsigned int vecSizeIdx_;  // index into the vector-width suffix table
+  unsigned int numReads_;
+  unsigned int typeIdx_;  // index into the element type-name table
+  bool isAMD;
+  bool cached_;
+};
+
+#endif  // _OCL_UAVReadSpeedHostMem_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp
new file mode 100644
index 0000000000..446b0c3c44
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp
@@ -0,0 +1,380 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfUAVWriteSpeedHostMem.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+const unsigned int NUM_SIZES = 4;
+
+// Buffer sizes in bytes: 256 KB, 1 MB, 4 MB and 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+static const unsigned int MaxTypes = 2;
+static unsigned int NumTypes = 2;  // lowered by the constructor without fp64
+static const char *types[MaxTypes] = {"float", "double"};
+static const unsigned int TypeSize[MaxTypes] = {sizeof(cl_float),
+                                                sizeof(cl_double)};
+static const unsigned int NumVecWidths = 5;
+static const char *vecWidths[NumVecWidths] = {"", "2", "4", "8", "16"};
+#define CHAR_BUF_SIZE 512
+
+// Bounded formatting; sprintf_s is used on WIN_OS to quiet compiler warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+void OCLPerfUAVWriteSpeedHostMem::genShader(unsigned int type,
+                                            unsigned int vecWidth) {
+  // Builds the OpenCL C source for the write-bandwidth kernel into shader_.
+  // The fp64 pragmas are guarded by macros so that the build options decide
+  // which extension (if any) gets enabled.
+  char signature[CHAR_BUF_SIZE];
+  SNPRINTF(signature, CHAR_BUF_SIZE,
+           "__kernel void _uavWriteSpeedHostMem(__global %s%s *outBuf)\n",
+           types[type], vecWidths[vecWidth]);
+
+  shader_ =
+      "#ifdef USE_AMD_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+      "#endif\n"
+      "#ifdef USE_KHR_DOUBLES\n"
+      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      "#endif\n";
+  shader_ += signature;
+  shader_ +=
+      "{\n"
+      "    int i = (int) get_global_id(0);\n"
+      "    *(outBuf + i) = 0;\n"
+      "}\n";
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // no-op callback
+
+OCLPerfUAVWriteSpeedHostMem::OCLPerfUAVWriteSpeedHostMem() {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  context_ = 0;
+  // Probes the device only to size the subtest matrix (_numSubTests).
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // Get last for default
+    platform = platforms[numPlatforms - 1];
+    for (unsigned i = 0; i < numPlatforms; ++i) {
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      error_ =
+          _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so error_ is deliberately not checked.
+      // Choose platform with GPU devices
+      if (num_devices > 0) {
+        platform = platforms[i];
+        break;
+      }
+    }
+    delete[] platforms;  // allocated with new[]
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Get the requested device, then free the list before any early return */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Created after the device query so that no early return can leak it.
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char *p = strstr(charbuf, "cl_khr_fp64");
+  char *p2 = strstr(charbuf, "cl_amd_fp64");
+
+  NumTypes = MaxTypes;
+  if (!p && !p2) {
+    // Doubles not supported
+    NumTypes--;
+  }
+  _numSubTests = NumTypes * NumVecWidths * NUM_SIZES;
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+}
+
+OCLPerfUAVWriteSpeedHostMem::~OCLPerfUAVWriteSpeedHostMem() {}  // empty body
+
+void OCLPerfUAVWriteSpeedHostMem::setData(cl_mem buffer, float val) {
+  // Blocking map, store val in every 4-byte slot, then unmap and drain.
+  float *dst = (float *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  for (unsigned int n = bufSize_ >> 2, k = 0; k != n; ++k) dst[k] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, dst, 0, 0, 0);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfUAVWriteSpeedHostMem::checkData(cl_mem buffer) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_READ, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) {
+    if (data[i] != 0.0f) {
+      // Print only data[i]: i + 1..3 may lie past the end of the mapping.
+      printf("Data validation failed at index %u!\n", i);
+      printf("Expected %f\nGot %f\n", 0.0, (double)data[i]);
+      // The CHECK macros fire on a true condition; 0 silently passed.
+      CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+      break;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+  _wrapper->clFinish(cmd_queue_);
+}
+
+void OCLPerfUAVWriteSpeedHostMem::open(unsigned int test, char *units,
+                                       double &conversion,
+                                       unsigned int deviceId) {
+  // Decodes the subtest index into (size, vector width, type), then creates
+  // the context, queue, host-allocated output buffer and the write kernel.
+  // units is not touched here; conversion is always set to 1.
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  // Zero every handle first so close() only releases what was created here.
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    // The platform is picked by _platformIndex rather than searched for.
+    // NOTE(review): the index is not range-checked against numPlatforms.
+    platform = platforms[_platformIndex];
+    // Start empty so the strcmp below is defined even if the query fails.
+    char pbuf[100] = "";
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is deliberately not checked here.
+    // The vendor string only selects which fp64 define the build receives.
+    if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+    // Allocated with new[], so it has to be released with delete[].
+    delete[] platforms;
+  }
+
+  // Subtest layout: size varies fastest, then vector width, then type.
+  width_ = Sizes[test % NUM_SIZES];
+  vecSizeIdx_ = (test / NUM_SIZES) % NumVecWidths;
+  typeIdx_ = (test / (NUM_SIZES * NumVecWidths)) % NumTypes;
+
+  bufSize_ = width_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the device list; only its first entry is used (deviceId is not). */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) {
+    device = devices[0];
+  }
+  // The handle has been copied out, so free the list before any early return.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(
+      context_, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize_, NULL,
+      &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader(typeIdx_, vecSizeIdx_);
+  const char *source = shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  // typeIdx_ 1 is "double": enable the matching fp64 pragma in the shader.
+  std::string args;
+  if (typeIdx_ == 1) {
+    if (isAMD) {
+      args += "-D USE_AMD_DOUBLES ";
+    } else {
+      args += "-D USE_KHR_DOUBLES ";
+    }
+  }
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Start empty so the printf is safe even if the log query itself fails.
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+    // CHECK_RESULT fires on a true condition; passing 0 never reported this.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ =
+      _wrapper->clCreateKernel(program_, "_uavWriteSpeedHostMem", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+
+  // Pre-fill with non-zero data so the kernel's zero writes are detectable.
+  setData(outBuffer_, 1.2345678f);
+}
+
+void OCLPerfUAVWriteSpeedHostMem::run(void) {
+  // One work-item per vector element, launched in groups of 64.
+  const int items = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_));
+  const int groupSize = 64;
+  const size_t globalSize[1] = {(size_t)items};
+  const size_t localSize[1] = {(size_t)groupSize};
+
+  // Time NUM_ITER back-to-back launches; a single clFinish ends the window.
+  CPerfCounter stopwatch;
+  stopwatch.Reset();
+  stopwatch.Start();
+  for (unsigned int iter = 0; iter < NUM_ITER; iter++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, NULL,
+                                              globalSize, localSize, 0, NULL,
+                                              NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+
+  stopwatch.Stop();
+  const double secs = stopwatch.GetElapsedTime();
+
+  // Bytes written across all iterations, scaled to GB/s.
+  const double gbps = ((double)bufSize_ * NUM_ITER * (double)(1e-09)) / secs;
+  _perfInfo = (float)gbps;
+
+  // Description: "<type><width>" padded, then the buffer size in bytes.
+  char vec[256];
+  char desc[256];
+  SNPRINTF(vec, sizeof(vec), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]);
+  SNPRINTF(desc, sizeof(desc), " %-8s (%8d) (GB/s) ", vec, width_);
+  testDescString = desc;
+
+  // Test just writes 0s
+  checkData(outBuffer_);
+}
+
+unsigned int OCLPerfUAVWriteSpeedHostMem::close(void) {
+  _wrapper->clFinish(cmd_queue_);
+  // Release each handle that is non-zero; clFinish above runs unconditionally.
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // NOTE(review): released handles are not reset to 0 here; open() zeroes them.
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h
new file mode 100644
index 0000000000..646f74ed0f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UAVWriteSpeedHostMem_H_
+#define _OCL_UAVWriteSpeedHostMem_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfUAVWriteSpeedHostMem : public OCLTestImp {
+ public:
+  OCLPerfUAVWriteSpeedHostMem();
+  virtual ~OCLPerfUAVWriteSpeedHostMem();
+  // Per-subtest entry points; open() decodes `test` into size/width/type.
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Kernel source built by genShader(); setData()/checkData() map `buffer`.
+  std::string shader_;
+  void genShader(unsigned int type, unsigned int vecWidth);
+  void setData(cl_mem buffer, float data);
+  void checkData(cl_mem buffer);
+
+  static const unsigned int NUM_ITER = 100;  // kernel launches timed by run()
+  // OpenCL handles: zeroed in open(), released in close() when non-zero.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  cl_int error_;  // result of the most recent OpenCL call
+  // Subtest parameters decoded in open().
+  unsigned int width_;       // entry of Sizes[], in bytes
+  unsigned int bufSize_;     // set equal to width_ in open()
+  unsigned int vecSizeIdx_;  // index into vecWidths[]
+  unsigned int typeIdx_;     // index into types[] / TypeSize[]
+  bool isAMD;  // platform vendor string is "Advanced Micro Devices, Inc."
+};
+
+#endif  // _OCL_UAVWriteSpeedHostMem_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp
new file mode 100644
index 0000000000..b9add8e915
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp
@@ -0,0 +1,270 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfUncoalescedRead.h"
+
+#include <string.h>
+
+#include <iomanip>
+#include <sstream>
+
+#include "Timer.h"
+
+const char* OCLPerfUncoalescedRead::kernel_str =
+    "#define NUM_READS 32\n\
+    __kernel void read_uncoalescing(__global float *input,__global float *output)\n\
+    {\n\
+        float val = (float)(0.0f);\n\
+        size_t gid = get_global_id(0);\n\
+        val = val + input[gid * NUM_READS + 0];\n\
+        val = val + input[gid * NUM_READS + 1];\n\
+        val = val + input[gid * NUM_READS + 2];\n\
+        val = val + input[gid * NUM_READS + 3];\n\
+        val = val + input[gid * NUM_READS + 4];\n\
+        val = val + input[gid * NUM_READS + 5];\n\
+        val = val + input[gid * NUM_READS + 6];\n\
+        val = val + input[gid * NUM_READS + 7];\n\
+        val = val + input[gid * NUM_READS + 8];\n\
+        val = val + input[gid * NUM_READS + 9];\n\
+        val = val + input[gid * NUM_READS + 10];\n\
+        val = val + input[gid * NUM_READS + 11];\n\
+        val = val + input[gid * NUM_READS + 12];\n\
+        val = val + input[gid * NUM_READS + 13];\n\
+        val = val + input[gid * NUM_READS + 14];\n\
+        val = val + input[gid * NUM_READS + 15];\n\
+        val = val + input[gid * NUM_READS + 16];\n\
+        val = val + input[gid * NUM_READS + 17];\n\
+        val = val + input[gid * NUM_READS + 18];\n\
+        val = val + input[gid * NUM_READS + 19];\n\
+        val = val + input[gid * NUM_READS + 20];\n\
+        val = val + input[gid * NUM_READS + 21];\n\
+        val = val + input[gid * NUM_READS + 22];\n\
+        val = val + input[gid * NUM_READS + 23];\n\
+        val = val + input[gid * NUM_READS + 24];\n\
+        val = val + input[gid * NUM_READS + 25];\n\
+        val = val + input[gid * NUM_READS + 26];\n\
+        val = val + input[gid * NUM_READS + 27];\n\
+        val = val + input[gid * NUM_READS + 28];\n\
+        val = val + input[gid * NUM_READS + 29];\n\
+        val = val + input[gid * NUM_READS + 30];\n\
+        val = val + input[gid * NUM_READS + 31];\n\
+        output[gid] = val;\n\
+    }\n";  // keep NUM_READS in sync with OCLPerfUncoalescedRead.h
+
+OCLPerfUncoalescedRead::OCLPerfUncoalescedRead()  // NULL keeps close() safe
+    : silentFailure(false), input_buff(NULL) { _numSubTests = 3; }
+OCLPerfUncoalescedRead::~OCLPerfUncoalescedRead() {}
+
+void OCLPerfUncoalescedRead::open(unsigned int test, char* units,
+                                  double& conversion, unsigned int deviceId) {
+  // Reset per-subtest state first so early returns leave it defined.
+  silentFailure = false;
+  input_buff = NULL;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "error_ opening test");
+  _openTest = test;
+  program_ = 0;
+  kernel_ = 0;
+
+  if (test > 0) {
+    size_t param_size = 0;
+    error_ = _wrapper->clGetDeviceInfo(
+        devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    char* strVersion = (char*)malloc(param_size);
+    CHECK_RESULT(strVersion == NULL, "malloc failed");
+    error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                       CL_DEVICE_OPENCL_C_VERSION, param_size,
+                                       strVersion, 0);
+    // "OpenCL C <major>.<minor> ..." puts the major version digit at index 9.
+    if (error_ == CL_SUCCESS && (param_size <= 9 || strVersion[9] < '2')) {
+      printf("\nOpenCL C 2.0 not supported\n");
+      silentFailure = true;
+    }
+    free(strVersion);  // freed before the check below can return
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    if (silentFailure) return;
+  }
+
+  cl_mem buffer =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY,
+                               SIZE * NUM_READS * sizeof(cl_float), 0, &error_);
+  buffers_.push_back(buffer);
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                    SIZE * sizeof(cl_float), 0, &error_);
+  buffers_.push_back(buffer);
+
+  // Fixed seed: validate() recomputes the sums from this same host copy.
+  srand(0x8956);
+  input_buff = (float*)malloc(SIZE * NUM_READS * sizeof(float));
+  CHECK_RESULT(input_buff == NULL, "malloc(input_buff) failed");
+  for (unsigned int i = 0; i < SIZE * NUM_READS; ++i) {
+    input_buff[i] = (float)rand();
+  }
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], buffers_[0], CL_TRUE, 0,
+      SIZE * NUM_READS * sizeof(cl_float), input_buff, 0, 0, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer failed");
+
+  float* buff = (float*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], buffers_[1], CL_TRUE, CL_MAP_WRITE, 0,
+      SIZE * sizeof(cl_float), 0, 0, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer failed");
+  memset(buff, 0, SIZE * sizeof(cl_float));
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffers_[1],
+                                             buff, 0, 0, 0);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject failed");
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  std::string compileOptions = "";
+  if (test > 0) {
+    compileOptions = "-cl-std=CL2.0";
+  }
+  if (test > 1) {
+    compileOptions += " -fsc-use-buffer-for-hsa-global ";
+  }
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    compileOptions.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "read_uncoalescing", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void*)&buffers_[1]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+}
+
+void OCLPerfUncoalescedRead::validate(void) {
+  // Recomputes each work-item's sum on the host and compares it exactly.
+  float* buff = (float*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], buffers_[1], CL_TRUE, CL_MAP_READ, 0,
+      SIZE * sizeof(cl_float), 0, 0, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < SIZE; ++i) {
+    volatile float val = 0;
+    for (int j = 0; j < NUM_READS; ++j) {
+      val += input_buff[i * NUM_READS + j];
+    }
+    if (val != buff[i]) {
+      std::string errorMsg = "Invalid result.  Expected: ";
+      errorMsg += std::to_string(val);
+      errorMsg += " Actual result: ";
+      errorMsg += std::to_string(buff[i]);
+      // Record without returning so the mapping is still released below.
+      CHECK_RESULT_NO_RETURN(true, errorMsg.c_str());
+      break;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffers_[1],
+                                             buff, 0, 0, 0);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject failed");
+}
+
+void OCLPerfUncoalescedRead::run(void) {
+  if (silentFailure) {
+    return;
+  }
+  CPerfCounter timer;
+
+  // Warm up; workGroupSize is passed as the global work size (local is NULL)
+  size_t workGroupSize = SIZE;
+  for (int i = 0; i < 50; ++i) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, &workGroupSize, NULL, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  // Timed pass: one event per launch so per-kernel times can be read back.
+  cl_event eventArr[NUM_ITER];
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, &workGroupSize, NULL, 0,
+                                              NULL, &eventArr[i]);
+
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  }
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+  double sec1 = timer.GetElapsedTime();
+  double sec2 = 0;
+  for (unsigned int i = 0; i < NUM_ITER; ++i) {
+    cl_ulong startTime = 0, endTime = 0;
+    error_ = _wrapper->clGetEventProfilingInfo(eventArr[i],
+                                               CL_PROFILING_COMMAND_START,
+                                               sizeof(cl_ulong), &startTime, 0);
+    CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
+    error_ = _wrapper->clGetEventProfilingInfo(
+        eventArr[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+    CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
+    sec2 += 1e-9 * (endTime - startTime);
+    error_ = _wrapper->clReleaseEvent(eventArr[i]);
+    CHECK_RESULT(error_, "clReleaseEvent failed");
+  }
+
+  validate();
+
+  // Read bandwidth in GB/s: perf1 uses the host timer, perf2 the event times
+  double perf1 = ((double)SIZE * NUM_READS * NUM_ITER * sizeof(cl_float) *
+                  (double)(1e-09)) /
+                 sec1;
+  double perf2 = ((double)SIZE * NUM_READS * NUM_ITER * sizeof(cl_float) *
+                  (double)(1e-09)) /
+                 sec2;
+  _perfInfo = (float)perf2;
+  // The label below shows perf1 and sec1; perf2 is what _perfInfo stores.
+  std::ostringstream strStream;
+  switch (_openTest) {
+    case 0:
+      strStream << "OCL1.2      ";
+      break;
+    case 1:
+      strStream << "OCL2.0      ";
+      break;
+    case 2:
+      strStream << "OCL2.0/flag ";
+      break;
+  }
+
+  strStream << std::fixed << std::setprecision(2) << perf1 << " timer GB/s ";
+  strStream << "time: " << std::setprecision(3) << sec1 << "s (profile GB/s)";
+  testDescString = strStream.str();
+  ;
+}
+
+unsigned int OCLPerfUncoalescedRead::close(void) {
+  // Clear after free so a repeated close() cannot double-free the buffer.
+  free(input_buff);
+  input_buff = NULL;
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h
new file mode 100644
index 0000000000..b9e1ffde1d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UncoalescedRead_H_
+#define _OCL_UncoalescedRead_H_
+
+#include "OCLTestImp.h"
+#define NUM_READS 32
+class OCLPerfUncoalescedRead : public OCLTestImp {
+ public:
+  OCLPerfUncoalescedRead();
+  virtual ~OCLPerfUncoalescedRead();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // frees input_buff, then base close()
+
+ private:
+  static const unsigned int NUM_ITER = 1000;  // profiled events summed in run()
+  static const unsigned int SIZE = 250000;  // element count in the GB/s formula
+  static const char* kernel_str;  // presumably the kernel source (in the .cpp)
+  bool silentFailure;  // NOTE(review): usage not visible from this header
+  float* input_buff;   // host allocation released by close()
+  void validate(void);  // called by run() before the bandwidth is computed
+};
+
+#endif  // _OCL_UncoalescedRead_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp
new file mode 100644
index 0000000000..41d17ad7f5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp
@@ -0,0 +1,353 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfVerticalFetch.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <fstream>
+#include <sstream>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+#define WIDTH 4952  // default gws[0] when dim.ini is absent
+#define HEIGHT 3288  // only used for the initial value of Sizes[0]
+unsigned int Sizes[NUM_SIZES] = {WIDTH * HEIGHT * 4};  // open() overwrites [0]
+
+#define KERNEL_CODE(...) #__VA_ARGS__  // stringifies the kernel source below
+const static char* strKernel = KERNEL_CODE(
+\n __kernel void ResizeVerticalFilter(
+    const __global uint* inputImage, const unsigned int inputColumns,
+    const unsigned int inputRows, __local uint* inputImageCache,
+    const int numCachedPixels, __global uint* dst) {
+  const unsigned int startY = get_group_id(1) * get_local_size(1);
+  float scale = 0.5f;
+  const float support = 0.5f;
+  const int cacheRangeStartY =
+      max((int)((startY + 0.5f) / 1.0f + support + 0.5f), (int)(0));
+  const int cacheRangeEndY =
+      min((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
+  const unsigned int x = get_global_id(0);
+  event_t e = async_work_group_strided_copy(
+      inputImageCache, inputImage + cacheRangeStartY * inputColumns + x,
+      cacheRangeEndY - cacheRangeStartY, inputColumns, 0);
+  wait_group_events(1, &e);
+
+  if (get_local_id(1) == 0) {
+    //    uint sum = 0;
+    //    for (unsigned int chunk = 0; chunk < numCachedPixels; chunk++) {
+    //      sum += inputImageCache[chunk];
+    //    }
+    atomic_add(dst, inputImageCache[0]);
+  }
+}
+\n);
+
+// Members close() inspects must be defined even if open() bails out early.
+OCLPerfVerticalFetch::OCLPerfVerticalFetch()
+    : srcBuffer_(0), dstBuffer_(0), skip_(false), ptr_(nullptr) {
+  _numSubTests = 6;  // one per case of the switch in open()
+}
+OCLPerfVerticalFetch::~OCLPerfVerticalFetch() {}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op, unused here
+
+// Prepares one sub-test: memory placement, geometry, program and buffers.
+void OCLPerfVerticalFetch::open(unsigned int test, char* units,
+                                double& conversion, unsigned int deviceId) {
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  skip_ = false;
+  dstBuffer_ = 0;
+  srcBuffer_ = 0;  // close() tests this handle, so never leave it stale
+  cl_uint maxCUs;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(cl_uint), &maxCUs, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  wgs = 64;
+  const static cl_uint wavesPerCU = 8;
+  nWorkItems = maxCUs * wavesPerCU * wgs;
+  uint32_t memLoc = CL_MEM_USE_HOST_PTR;
+
+  inputData = 0x1;
+  switch (test) {
+    case 0:
+      nIter = 1;
+      mem_type_ = "UHP";
+      break;
+    case 1:
+      nIter = 100;
+      mem_type_ = "UHP";
+      break;
+    case 2:
+      nIter = 1;
+      memLoc = CL_MEM_ALLOC_HOST_PTR;
+      mem_type_ = "AHP";
+      break;
+    case 3:
+      nIter = 100;
+      memLoc = CL_MEM_ALLOC_HOST_PTR;
+      mem_type_ = "AHP";
+      break;
+    case 4:
+      nIter = 1;
+      memLoc = 0;
+      mem_type_ = "dev";
+      break;
+    case 5:
+      nIter = 1000;
+      memLoc = 0;
+      mem_type_ = "dev";
+      break;
+  }
+
+  std::string nameFile("dim.ini");
+  std::fstream is(nameFile.c_str(), std::fstream::in | std::fstream::binary);
+  std::string line;
+  if (is.is_open()) {
+    size_t posStart = 0;
+    do {
+      std::getline(is, line);
+    } while (line.find_first_of('/', posStart) != std::string::npos);
+    // Find global/local
+    posStart = 0;
+    size_t posEnd = 1;
+    std::string dimS = line.substr(posStart, posEnd - posStart);
+    dim = std::stoi(dimS.c_str(), nullptr, 10);
+    // gws/lws hold three entries, and index 1 of each is read further down.
+    CHECK_RESULT((dim < 2) || (dim > 3), "dim.ini: dimension must be 2 or 3");
+    posStart = posEnd;
+    posEnd = line.find_first_of('[', posStart);
+    for (cl_uint i = 0; i < dim; ++i) {
+      posStart = posEnd + 1;
+      posEnd = line.find_first_of(',', posStart);
+      std::string global = line.substr(posStart, posEnd - posStart);
+      gws[i] = std::stoi(global.c_str(), nullptr, 10);
+    }
+    posEnd = line.find_first_of('[', posStart);
+    for (cl_uint i = 0; i < dim; ++i) {
+      posStart = posEnd + 1;
+      posEnd = line.find_first_of(',', posStart);
+      std::string global = line.substr(posStart, posEnd - posStart);
+      lws[i] = std::stoi(global.c_str(), nullptr, 10);
+    }
+    posEnd = line.find_first_of('[', posStart);
+    posStart = posEnd + 1;
+    posEnd = line.find_first_of(',', posStart);
+    std::string global = line.substr(posStart, posEnd - posStart);
+    numCachedPixels_ = std::stoi(global.c_str(), nullptr, 10);
+    is.close();
+  } else {
+    dim = 2;
+    gws[0] = WIDTH;
+    gws[1] = 512;
+    lws[0] = 1;
+    lws[1] = 256;
+    numCachedPixels_ = 1676;
+  }
+  cl_uint width = static_cast<cl_uint>(gws[0]);
+  cl_uint height = numCachedPixels_ * static_cast<cl_uint>(gws[1] / lws[1]);
+  if (gws[1] > 512) {
+    gws[1] = 512;
+  }
+  Sizes[0] = width * height * sizeof(int);
+  nBytes = Sizes[0];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the query below fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "ResizeVerticalFilter", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  if (memLoc == CL_MEM_USE_HOST_PTR) {
+    ptr_ = malloc(nBytes);
+  }
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY | memLoc,
+                                        nBytes, ptr_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(srcBuffer) failed");
+  void* mem = _wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], srcBuffer_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      nBytes, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); ++i) {
+    reinterpret_cast<cl_uint*>(mem)[i] = inputData;
+  }
+  // The kernel does atomic_add on dst, so the buffer must be readable too.
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                        sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], srcBuffer_, mem, 0,
+                                    NULL, NULL);
+  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_TRUE,
+                                     CL_MAP_READ | CL_MAP_WRITE, 0,
+                                     sizeof(cl_uint), 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  memset(mem, 0, sizeof(cl_uint));
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], dstBuffer_, mem, 0,
+                                    NULL, NULL);
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_uint), (void*)&width);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), &height);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 3,
+                                    numCachedPixels_ * sizeof(cl_uint), 0);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint),
+                                    (void*)&numCachedPixels_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_mem), &dstBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+// Runs one untimed warm-up dispatch, then times nIter dispatches. _perfInfo
+// gets the event-profiled rate; the description string gets the wall rate.
+void OCLPerfVerticalFetch::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, dim,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Blocking read into a local: a heap block would leak on an early return.
+  cl_uint memResult = 0;
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_,
+                                         CL_TRUE, 0, sizeof(cl_uint),
+                                         &memResult, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // NOTE(review): the condition below is the literal 0, so (assuming the
+  // macro fires on a true condition) a mismatch is never reported - confirm.
+  if (memResult != ((gws[0] * gws[1]) / (lws[0] * lws[1]))) {
+    CHECK_RESULT_NO_RETURN(0, "Data validation failed for warm up run!\n");
+  }
+
+  timer.Reset();
+  timer.Start();
+  double sec2 = 0;
+  cl_event* events = new cl_event[nIter];
+  unsigned int nQueued = 0;  // events[0, nQueued) hold live handles
+  for (; nQueued < nIter; nQueued++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_,
+                                              dim, NULL, gws, lws, 0, NULL,
+                                              &events[nQueued]);
+    if (error_ != CL_SUCCESS) {
+      break;  // reported after the cleanup below so nothing leaks
+    }
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+  cl_int profErr = CL_SUCCESS;  // first profiling failure, reported later
+  for (unsigned int i = 0; i < nQueued; i++) {
+    cl_ulong startTime = 0, endTime = 0;
+    cl_int rc = _wrapper->clGetEventProfilingInfo(
+        events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
+    if (rc == CL_SUCCESS) {
+      rc = _wrapper->clGetEventProfilingInfo(
+          events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+    }
+    profErr = (profErr == CL_SUCCESS) ? rc : profErr;
+    _wrapper->clReleaseEvent(events[i]);
+    sec2 += endTime - startTime;
+  }
+  double sec = timer.GetElapsedTime();
+  delete[] events;
+  CHECK_RESULT((nQueued < nIter), "clEnqueueNDRangeKernel() failed");
+  error_ = profErr;
+  CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
+  double perf2 = ((double)nBytes * nIter) / sec2;
+  _perfInfo = (float)perf2;
+  float perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf),
+           " (%8u bytes, %s) i:%4u Wall time Perf: %.2f (GB/s)", nBytes,
+           mem_type_, nIter, perfInfo);
+  testDescString = buf;
+}
+
+// Releases the two buffers and the host backing store, then the base state.
+unsigned int OCLPerfVerticalFetch::close(void) {
+  if (!skip_) {
+    if (srcBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(srcBuffer_) failed");
+      srcBuffer_ = 0;  // guard against a second release
+    }
+    if (dstBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(dstBuffer_) failed");
+      dstBuffer_ = 0;
+    }
+  }
+  free(ptr_);  // free(NULL) is a no-op, so no guard is needed
+  ptr_ = nullptr;
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h
new file mode 100644
index 0000000000..d94e1eb22f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#pragma once
+#include "OCLTestImp.h"
+
+class OCLPerfVerticalFetch : public OCLTestImp {
+ public:
+  OCLPerfVerticalFetch();
+  virtual ~OCLPerfVerticalFetch();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  cl_mem srcBuffer_;        // kernel arg 0: input filled with inputData
+  cl_mem dstBuffer_;        // kernel arg 5: single cl_uint accumulator
+  unsigned int nWorkItems;  // number of GPU work items
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // size in bytes of srcBuffer_
+  unsigned int nIter;       // overall number of timing loops
+  cl_uint inputData;        // input data to fill the input buffer
+  bool skip_;               // when set, run() returns without doing work
+  void* ptr_;               // host memory behind CL_MEM_USE_HOST_PTR sub-tests
+  const char* mem_type_;    // "UHP", "AHP" or "dev", shown in the result string
+  cl_uint dim;              // work dimension passed to clEnqueueNDRangeKernel
+  size_t gws[3];            // global work size per dimension
+  size_t lws[3];            // local work size per dimension
+  cl_uint numCachedPixels_;  // kernel arg 4; also sizes the __local arg 3
+};
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp
new file mode 100644
index 0000000000..343471a45d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+//
+// Includes for tests
+//
+#include "OCLPerfAES256.h"
+#include "OCLPerfAtomicSpeed.h"
+#include "OCLPerfBufferCopyOverhead.h"
+#include "OCLPerfBufferCopySpeed.h"
+#include "OCLPerfBufferReadSpeed.h"
+#include "OCLPerfBufferWriteSpeed.h"
+#include "OCLPerfCPUMemSpeed.h"
+#include "OCLPerfCommandQueue.h"
+#include "OCLPerfConcurrency.h"
+#include "OCLPerfDevMemReadSpeed.h"
+#include "OCLPerfDevMemWriteSpeed.h"
+#include "OCLPerfDeviceConcurrency.h"
+#include "OCLPerfDeviceEnqueue.h"
+#include "OCLPerfDispatchSpeed.h"
+#include "OCLPerfDoubleDMA.h"
+#include "OCLPerfDoubleDMASeq.h"
+#include "OCLPerfFillBuffer.h"
+#include "OCLPerfFillImage.h"
+#include "OCLPerfFlush.h"
+#include "OCLPerfGenericBandwidth.h"
+#include "OCLPerfGenoilSiaMiner.h"
+#include "OCLPerfImageCopyCorners.h"
+#include "OCLPerfImageCopySpeed.h"
+#include "OCLPerfImageMapUnmap.h"
+#include "OCLPerfImageReadSpeed.h"
+#include "OCLPerfImageSampleRate.h"
+#include "OCLPerfImageWriteSpeed.h"
+#include "OCLPerfKernelArguments.h"
+#include "OCLPerfLDSLatency.h"
+#include "OCLPerfLDSReadSpeed.h"
+#include "OCLPerfMandelbrot.h"
+#include "OCLPerfMapBufferReadSpeed.h"
+#include "OCLPerfMapBufferWriteSpeed.h"
+#include "OCLPerfMapImageReadSpeed.h"
+#include "OCLPerfMapImageWriteSpeed.h"
+#include "OCLPerfMatrixTranspose.h"
+#include "OCLPerfMemCombine.h"
+#include "OCLPerfMemCreate.h"
+#include "OCLPerfMemLatency.h"
+#include "OCLPerfPinnedBufferReadSpeed.h"
+#include "OCLPerfPinnedBufferWriteSpeed.h"
+#include "OCLPerfPipeCopySpeed.h"
+#include "OCLPerfSHA256.h"
+#include "OCLPerfSampleRate.h"
+#include "OCLPerfScalarReplArrayElem.h"
+#include "OCLPerfSdiP2PCopy.h"
+#include "OCLPerfSepia.h"
+#include "OCLPerfTextureMemLatency.h"
+#include "OCLPerfUAVReadSpeed.h"
+#include "OCLPerfUAVReadSpeedHostMem.h"
+#include "OCLPerfUAVWriteSpeedHostMem.h"
+#include "OCLPerfVerticalFetch.h"
+// 2.0
+#include "OCLPerf3DImageWriteSpeed.h"
+#include "OCLPerfAtomicSpeed20.h"
+#include "OCLPerfDeviceEnqueue2.h"
+#include "OCLPerfDeviceEnqueueEvent.h"
+#include "OCLPerfDeviceEnqueueSier.h"
+#include "OCLPerfImageCreate.h"
+#include "OCLPerfImageReadWrite.h"
+#include "OCLPerfImageReadsRGBA.h"
+#include "OCLPerfProgramGlobalRead.h"
+#include "OCLPerfProgramGlobalWrite.h"
+#include "OCLPerfSVMAlloc.h"
+#include "OCLPerfSVMKernelArguments.h"
+#include "OCLPerfSVMMap.h"
+#include "OCLPerfSVMMemFill.h"
+#include "OCLPerfSVMMemcpy.h"
+#include "OCLPerfSVMSampleRate.h"
+#include "OCLPerfUncoalescedRead.h"
+
+//
+//  Helper macro for adding tests
+//
+template <typename T>  // T: the test class to instantiate
+static void* dictionary_CreateTestFunc(void) {
+  return static_cast<void*>(new T());  // type-erased factory result
+}
+
+#define TEST(name) \
+  { #name, &dictionary_CreateTestFunc < name> }  // {test name, factory}
+
+TestEntry TestList[] = {  // name/factory pairs built by TEST()
+    TEST(OCLPerfUAVReadSpeed),
+    TEST(OCLPerfUAVReadSpeedHostMem),
+    TEST(OCLPerfUAVWriteSpeedHostMem),
+    TEST(OCLPerfLDSReadSpeed),
+    TEST(OCLPerfDispatchSpeed),
+    TEST(OCLPerfMapBufferReadSpeed),
+    TEST(OCLPerfMapBufferWriteSpeed),
+    TEST(OCLPerfBufferReadSpeed),
+    TEST(OCLPerfBufferReadRectSpeed),
+    TEST(OCLPerfPinnedBufferReadSpeed),
+    TEST(OCLPerfPinnedBufferReadRectSpeed),
+    TEST(OCLPerfBufferWriteSpeed),
+    TEST(OCLPerfBufferWriteRectSpeed),
+    TEST(OCLPerfPinnedBufferWriteSpeed),
+    TEST(OCLPerfPinnedBufferWriteRectSpeed),
+    TEST(OCLPerfBufferCopySpeed),
+    TEST(OCLPerfBufferCopyRectSpeed),
+    TEST(OCLPerfMapImageReadSpeed),
+    TEST(OCLPerfMapImageWriteSpeed),
+    TEST(OCLPerfMemCombine),
+    TEST(OCLPerfImageReadSpeed),
+    TEST(OCLPerfPinnedImageReadSpeed),
+    TEST(OCLPerfImageWriteSpeed),
+    TEST(OCLPerfPinnedImageWriteSpeed),
+    TEST(OCLPerfImageCopySpeed),
+    TEST(OCLPerfCPUMemSpeed),
+    TEST(OCLPerfMandelbrot),
+    TEST(OCLPerfAsyncMandelbrot),
+    TEST(OCLPerfConcurrency),
+    TEST(OCLPerfDeviceConcurrency),
+    TEST(OCLPerfAES256),
+    TEST(OCLPerfSHA256),
+    TEST(OCLPerfAtomicSpeed),
+    TEST(OCLPerfMatrixTranspose),
+    TEST(OCLPerfImageCopyCorners),
+    TEST(OCLPerfScalarReplArrayElem),
+    TEST(OCLPerfSdiP2PCopy),
+    TEST(OCLPerfSepia),
+    TEST(OCLPerfFlush),
+    TEST(OCLPerfMemCreate),
+    TEST(OCLPerfImageMapUnmap),
+    TEST(OCLPerfCommandQueue),
+    TEST(OCLPerfKernelArguments),
+    TEST(OCLPerfDoubleDMA),
+    TEST(OCLPerfDoubleDMASeq),
+    TEST(OCLPerfMemLatency),
+    TEST(OCLPerfTextureMemLatency),
+    TEST(OCLPerfSampleRate),
+    TEST(OCLPerfImageSampleRate),
+    TEST(OCLPerfBufferCopyOverhead),
+    TEST(OCLPerfMapDispatchSpeed),
+    TEST(OCLPerfDeviceEnqueue),
+    TEST(OCLPerfPipeCopySpeed),
+    TEST(OCLPerfGenericBandwidth),
+    TEST(OCLPerfLDSLatency),
+    TEST(OCLPerfDeviceEnqueue2),
+    TEST(OCLPerfSVMAlloc),
+    TEST(OCLPerfSVMMap),
+    TEST(OCLPerfDeviceEnqueueEvent),
+    TEST(OCLPerfSVMKernelArguments),
+    TEST(OCLPerfDeviceEnqueueSier),
+    TEST(OCLPerfProgramGlobalRead),
+    TEST(OCLPerfProgramGlobalWrite),
+    TEST(OCLPerfAtomicSpeed20),
+    TEST(OCLPerfSVMSampleRate),
+    TEST(OCLPerfImageCreate),
+    TEST(OCLPerfImageReadsRGBA),
+    TEST(OCLPerf3DImageWriteSpeed),
+    TEST(OCLPerfImageReadWrite),
+    TEST(OCLPerfSVMMemcpy),
+    TEST(OCLPerfSVMMemFill),
+    TEST(OCLPerfFillBuffer),
+    TEST(OCLPerfFillImage),
+    TEST(OCLPerfUncoalescedRead),
+    TEST(OCLPerfGenoilSiaMiner),
+    TEST(OCLPerfDevMemReadSpeed),
+    TEST(OCLPerfDevMemWriteSpeed),
+    TEST(OCLPerfVerticalFetch),
+};
+
+unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);  // entries
+unsigned int TestLibVersion = 0;
+const char* TestLibName = "oclperf";
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude b/projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude
new file mode 100644
index 0000000000..5004785a63
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude
@@ -0,0 +1,28 @@
+# We don't need to run regressions on these tests; they are purely for performance testing and debugging.
+OCLPerfMemLatency
+OCLPerfTextureMemLatency
+OCLPerfSampleRate
+OCLPerfImageSampleRate
+OCLPerfBufferCopyOverhead
+OCLPerfDeviceEnqueue
+OCLPerfPipeCopySpeed
+OCLPerfGenericBandwidth
+OCLPerfLDSLatency
+OCLPerfFillBuffer
+OCLPerfDeviceEnqueue2
+OCLPerfDeviceEnqueueEvent
+OCLPerfDeviceEnqueueSier
+OCLPerfSVMAlloc
+OCLPerfSVMMap
+OCLPerfSVMKernelArguments
+OCLPerfProgramGlobalRead
+OCLPerfProgramGlobalWrite
+OCLPerfAtomicSpeed20
+OCLPerfSVMSampleRate
+OCLPerfImageCreate
+OCLPerfImageReadsRGBA
+OCLPerf3DImageWriteSpeed
+OCLPerfImageReadWrite
+OCLPerfSVMMemcpy
+OCLPerfSVMMemFill
+OCLPerfFillImage
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp
new file mode 100644
index 0000000000..7e5df567ab
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLAsyncMap.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+static const size_t BufSize = 0x800000;    // buffer length in cl_uint elements
+static const size_t MapRegion = 0x100000;  // elements covered by one mapping
+static const unsigned int NumMaps = BufSize / MapRegion;  // 8 regions
+
+OCLAsyncMap::OCLAsyncMap() { _numSubTests = 1; }  // a single sub-test
+
+OCLAsyncMap::~OCLAsyncMap() {}
+
+// Creates one CL_MEM_READ_WRITE buffer of BufSize cl_uints; adds to buffers_.
+void OCLAsyncMap::open(unsigned int test, char* units, double& conversion,
+                       unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  cl_mem created = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, BufSize * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(created);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op, unused here
+
+// Maps NumMaps regions at once, tags each, unmaps, then remaps and verifies.
+void OCLAsyncMap::run(void) {
+  cl_uint* values[NumMaps];
+  cl_mem mapBuffer = buffers()[0];
+  size_t offset = 0;
+  size_t region = MapRegion * sizeof(cl_uint);
+
+  for (unsigned int i = 0; i < NumMaps; ++i) {
+    values[i] = reinterpret_cast<cl_uint*>(_wrapper->clEnqueueMapBuffer(
+        cmdQueues_[_deviceId], mapBuffer, CL_TRUE, (CL_MAP_READ | CL_MAP_WRITE),
+        offset, region, 0, NULL, NULL, &error_));
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed");
+    offset += region;
+  }
+  for (unsigned int i = 0; i < NumMaps; ++i) {
+    for (unsigned int j = 0; j < MapRegion; ++j) {
+      values[i][j] = i;
+    }
+  }
+  for (unsigned int i = 0; i < NumMaps; ++i) {
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer,
+                                               values[i], 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject() failed");
+  }
+
+  values[0] = reinterpret_cast<cl_uint*>(_wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], mapBuffer, CL_TRUE, CL_MAP_READ, 0,
+      BufSize * sizeof(cl_uint), 0, NULL, NULL, &error_));
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed");
+
+  bool mismatch = false;  // reported after unmapping so the map is released
+  for (unsigned int i = 0; i < NumMaps && !mismatch; ++i) {
+    values[i] = values[0] + i * MapRegion;
+    for (unsigned int j = 0; j < MapRegion && !mismatch; ++j) {
+      mismatch = (values[i][j] != i);
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer,
+                                             values[0], 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT(mismatch, "validation failed");
+}
+
+unsigned int OCLAsyncMap::close(void) { return OCLTestImp::close(); }  // base only
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h
new file mode 100644
index 0000000000..93cb3f52a3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ASYNC_MAP_H_
+#define _OCL_ASYNC_MAP_H_
+
+#include "OCLTestImp.h"
+
+class OCLAsyncMap : public OCLTestImp {  // maps one buffer as several regions
+ public:
+  OCLAsyncMap();
+  virtual ~OCLAsyncMap();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // creates the buffer
+  virtual void run(void);                    // map, fill, unmap, remap, verify
+  virtual unsigned int close(void);          // forwards to OCLTestImp::close()
+};
+
+#endif  // _OCL_ASYNC_MAP_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp
new file mode 100644
index 0000000000..15df346b6e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLAsyncTransfer.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+static const size_t Iterations = 0x100;  // kernel launches timed by run()
+static const size_t IterationDivider = 2;  // launches queued per readback
+static const size_t MaxBuffers = IterationDivider;  // device buffers in open()
+static const size_t BufSize = 0x800000;  // cl_uint elements per buffer
+
+const static char* strKernel =  // OpenCL C source built by open()
+    "__kernel void factorial(__global uint* out)                        \n"
+    "{                                                                  \n"
+    "   uint id = get_global_id(0);                                     \n"
+    "   uint factorial = 1;                                             \n"
+    "   for (uint i = 1; i < (id / 0x10000); ++i)                       \n"
+    "   {                                                               \n"
+    "       factorial *= i;                                             \n"
+    "   }                                                               \n"
+    "	out[id] = factorial;                                            \n"
+    "}                                                                  \n";
+
+OCLAsyncTransfer::OCLAsyncTransfer() { _numSubTests = 1; }  // sub-test count
+
+OCLAsyncTransfer::~OCLAsyncTransfer() {}  // empty: no work done here
+
+void OCLAsyncTransfer::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  // Builds the "factorial" kernel and creates the buffers that run() uses.
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // printable even if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  // First MaxBuffers entries of buffers_: CL_MEM_READ_WRITE device buffers.
+  cl_mem buffer;
+  for (size_t i = 0; i < MaxBuffers; ++i) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                      BufSize * sizeof(cl_uint), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+  // Last entry: a CL_MEM_ALLOC_HOST_PTR buffer of the same size, mapped by run().
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                    BufSize * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused no-op
+
+void OCLAsyncTransfer::run(void) {
+  void* values;
+  CPerfCounter timer;
+  // open() stored the host-allocated buffer after the MaxBuffers device ones.
+  cl_mem mapBuffer = buffers()[MaxBuffers];
+  values = _wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], mapBuffer, true, (CL_MAP_READ | CL_MAP_WRITE), 0,
+      BufSize * sizeof(cl_uint), 0, NULL, NULL, &error_);
+  // Without a valid mapping the reads below would target a bad host pointer.
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed");
+
+  timer.Reset();
+  timer.Start();
+  size_t x;
+  for (x = 0; x < Iterations / IterationDivider; x++) {
+    // Queue one kernel launch per device buffer.
+    for (size_t y = 0; y < IterationDivider; ++y) {
+      cl_mem buffer = buffers()[y];
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+      size_t gws[1] = {BufSize};
+      error_ = _wrapper->clEnqueueNDRangeKernel(
+          cmdQueues_[_deviceId], kernel_, 1, NULL, gws, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    }
+    // Non-blocking read of buffer 0 into the mapped pointer, then flush.
+    cl_mem readBuffer = buffers()[0];
+    error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], readBuffer,
+                                           false, 0, BufSize * sizeof(cl_uint),
+                                           values, 0, NULL, NULL);
+    _wrapper->clFlush(cmdQueues_[_deviceId]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+  printf(" Time: %.2f sec, BW: %.2f GB/s   ", sec, perf);
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer,
+                                             values, 0, NULL, NULL);
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject() failed");
+}
+
+unsigned int OCLAsyncTransfer::close(void) { return OCLTestImp::close(); }  // base-class close only
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h
new file mode 100644
index 0000000000..96303e7de2
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ASYNC_TRANSFER_H_
+#define _OCL_ASYNC_TRANSFER_H_
+
+#include "OCLTestImp.h"
+
+// Test that times kernel launches interleaved with non-blocking buffer reads.
+class OCLAsyncTransfer : public OCLTestImp {
+ public:
+  OCLAsyncTransfer();
+  virtual ~OCLAsyncTransfer();
+  // Test entry points, defined in OCLAsyncTransfer.cpp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_ASYNC_TRANSFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp
new file mode 100644
index 0000000000..083cb45ed6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp
@@ -0,0 +1,168 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLAtomicCounter.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static unsigned int MaxCounters = 2;  // counter buffers made by open()
+const static char* strKernel =  // work-item 0 bumps both counters once
+    "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable            \n"
+    "__kernel void atomic_test(                                             \n"
+    "   counter32_t counter0, counter32_t counter1, global uint* out_val)   \n"
+    "{                                                                      \n"
+    "   if (!get_global_id(0)) {                                            \n"
+    "       uint val0 = atomic_inc(counter0);                               \n"
+    "       uint val1 = atomic_dec(counter1);                               \n"
+    "       out_val[0] = val0;                                              \n"
+    "       out_val[1] = val1;                                              \n"
+    "   }                                                                   \n"
+    "}                                                                      \n";
+
+OCLAtomicCounter::OCLAtomicCounter() {
+  _numSubTests = 1;
+  failed_ = false;  // open() sets this when the test cannot run; run() checks it
+}
+
+OCLAtomicCounter::~OCLAtomicCounter() {}  // empty: no work done here
+
+void OCLAtomicCounter::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  // Prepares the kernel and buffers; sets failed_ when the test cannot run.
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening");
+  if (deviceId >= deviceCount_) {
+    failed_ = true;
+    return;
+  }
+  // clGetDeviceInfo rejects a buffer that is too small, so size it from the
+  // device's own answer. If a query fails the string stays empty -> skipped.
+  size_t size = 0;
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 0, NULL,
+                            &size);
+  char* name = new char[size + 1]();  // zero-filled, always terminated
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, size,
+                            name, NULL);
+  const bool hasCounters = strstr(name, "cl_ext_atomic_counter") != NULL;
+  delete[] name;
+  if (!hasCounters) {
+    printf("Atomic counter extension is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], "-legacy",
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // printable even if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "atomic_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  // One cl_uint buffer per counter, then one buffer for the kernel's output.
+  cl_mem buffer;
+  for (unsigned int i = 0; i < MaxCounters; ++i) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                      sizeof(cl_uint), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+  buffer =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                               MaxCounters * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused no-op
+
+void OCLAtomicCounter::run(void) {
+  // Nothing to do when open() flagged the test as unable to run.
+  if (failed_) {
+    return;
+  }
+  cl_uint initVal[2] = {5, 10};
+  for (unsigned int i = 0; i < MaxCounters; ++i) {
+    error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers()[i],
+                                            true, 0, sizeof(cl_uint),
+                                            &initVal[i], 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+  }
+
+  // Kernel arguments: the counter buffers first, the output buffer last.
+  for (unsigned int i = 0; i < MaxCounters + 1; ++i) {
+    cl_mem buffer = buffers()[i];
+    error_ = _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  }
+
+  size_t gws[1] = {64};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  cl_uint outputV[MaxCounters] = {0};
+
+  // Expected counter contents after the kernel: one increment, one decrement.
+  initVal[0]++;
+  initVal[1]--;
+
+  for (unsigned int i = 0; i < MaxCounters; ++i) {
+    error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers()[i],
+                                           true, 0, sizeof(cl_uint),
+                                           &outputV[i], 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+    if (initVal[i] != outputV[i]) {
+      printf("%u != %u", initVal[i], outputV[i]);
+      CHECK_RESULT(true, " - Incorrect result for counter!\n");
+    }
+  }
+
+  // Restore the original value to check the returned result in the kernel
+  initVal[0]--;
+  initVal[1]++;
+
+  error_ = _wrapper->clEnqueueReadBuffer(
+      cmdQueues_[_deviceId], buffers()[MaxCounters], true, 0,
+      MaxCounters * sizeof(cl_uint), outputV, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  for (unsigned int i = 0; i < MaxCounters; ++i) {
+    if (initVal[i] != outputV[i]) {
+      printf("%u != %u", initVal[i], outputV[i]);
+      CHECK_RESULT(true,
+                   " - Incorrect result for counter inside kernel. Returned "
+                   "value != original.\n");
+    }
+  }
+}
+
+unsigned int OCLAtomicCounter::close(void) { return OCLTestImp::close(); }  // base-class close only
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h
new file mode 100644
index 0000000000..d4bdb1a453
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ATOMIC_COUNTER_H_
+#define _OCL_ATOMIC_COUNTER_H_
+
+#include "OCLTestImp.h"
+
+// Test for kernels taking counter32_t arguments (see OCLAtomicCounter.cpp).
+class OCLAtomicCounter : public OCLTestImp {
+ public:
+  OCLAtomicCounter();
+  virtual ~OCLAtomicCounter();
+  // Test entry points, defined in OCLAtomicCounter.cpp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // set by open() when the test cannot run; run() then returns
+};
+
+#endif  // _OCL_ATOMIC_COUNTER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp
new file mode 100644
index 0000000000..b8a07c0df4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp
@@ -0,0 +1,612 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLBlitKernel.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static cl_uint Stages = 4;
+const static cl_uint ThreadsForCheck = 1 << Stages;  // 16 while Stages is 4
+
+#define KERNEL_CODE(...) #__VA_ARGS__  // turns its arguments into a string
+
+const static char* strKernel = 
+    KERNEL_CODE(
+    \n
+    \x23 if OCL20
+    \n
+    extern void __amd_scheduler(__global void *, __global void *, uint);
+    \n
+    \x23 endif
+    \n
+    extern void __amd_copyBufferToImage(
+        __global uint*, __write_only image2d_array_t, ulong4,
+        int4, int4, uint4, ulong4);
+
+    extern void __amd_copyImageToBuffer(
+        __read_only image2d_array_t, __global uint*, __global ushort*,
+        __global uchar*, int4, ulong4, int4, uint4, ulong4);
+
+    extern void __amd_copyImage(
+        __read_only image2d_array_t, __write_only image2d_array_t,
+        int4, int4, int4);
+
+    extern void __amd_copyImage1DA(
+        __read_only image2d_array_t, __write_only image2d_array_t,
+        int4, int4, int4);
+
+    extern void __amd_copyBufferRect(
+        __global uchar*, __global uchar*,
+        ulong4, ulong4, ulong4);
+
+    extern void __amd_copyBufferRectAligned(
+        __global uint*, __global uint*,
+        ulong4, ulong4, ulong4);
+
+    extern void __amd_copyBuffer(
+        __global uchar*, __global uchar*,
+        ulong, ulong, ulong, uint);
+
+    extern void __amd_copyBufferAligned(
+        __global uint*, __global uint*,
+        ulong, ulong, ulong, uint);
+
+    extern void __amd_fillBuffer(
+        __global uchar*, __global uint*, __constant uchar*,
+        uint, ulong, ulong);
+
+    extern void __amd_fillImage(
+        __write_only image2d_array_t,
+        float4, int4, uint4, int4, int4, uint);
+
+    __kernel void copyBufferToImage(
+        __global uint* src,
+        __write_only image2d_array_t dst,
+        ulong4 srcOrigin,
+        int4 dstOrigin,
+        int4 size,
+        uint4 format,
+        ulong4 pitch)
+    {
+        __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch);
+    }
+
+    __kernel void copyImageToBuffer(
+        __read_only image2d_array_t src,
+        __global uint* dstUInt,
+        __global ushort* dstUShort,
+        __global uchar* dstUChar,
+        int4 srcOrigin,
+        ulong4 dstOrigin,
+        int4 size,
+        uint4 format,
+        ulong4 pitch)
+    {
+        __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar,
+                                  srcOrigin, dstOrigin, size, format, pitch);
+    }
+
+    __kernel void copyImage(
+        __read_only  image2d_array_t src,
+        __write_only image2d_array_t dst,
+        int4 srcOrigin,
+        int4 dstOrigin,
+        int4 size)
+    {
+        __amd_copyImage(src, dst, srcOrigin, dstOrigin, size);
+    }
+
+    __kernel void copyImage1DA(
+        __read_only image2d_array_t src,
+        __write_only image2d_array_t dst,
+        int4 srcOrigin,
+        int4 dstOrigin,
+        int4 size)
+    {
+        __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size);
+    }
+
+    __kernel void copyBufferRect(
+        __global uchar* src,
+        __global uchar* dst,
+        ulong4 srcRect,
+        ulong4 dstRect,
+        ulong4 size)
+    {
+        __amd_copyBufferRect(src, dst, srcRect, dstRect, size);
+    }
+
+    __kernel void copyBufferRectAligned(
+        __global uint* src,
+        __global uint* dst,
+        ulong4 srcRect,
+        ulong4 dstRect,
+        ulong4 size)
+    {
+        __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size);
+    }
+
+    __kernel void copyBuffer(
+        __global uchar* srcI,
+        __global uchar* dstI,
+        ulong srcOrigin,
+        ulong dstOrigin,
+        ulong size,
+        uint remain)
+    {
+        __amd_copyBuffer(srcI, dstI, srcOrigin, dstOrigin, size, remain);
+    }
+
+    __kernel void copyBufferAligned(
+        __global uint* src,
+        __global uint* dst,
+        ulong srcOrigin,
+        ulong dstOrigin,
+        ulong size,
+        uint alignment)
+    {
+        __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment);
+    }
+
+    __kernel void fillBuffer(
+        __global uchar* bufUChar,
+        __global uint* bufUInt,
+        __constant uchar* pattern,
+        uint patternSize,
+        ulong offset,
+        ulong size)
+    {
+        __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size);
+    }
+
+    __kernel void fillImage(
+        __write_only image2d_array_t image,
+        float4 patternFLOAT4,
+        int4 patternINT4,
+        uint4 patternUINT4,
+        int4 origin,
+        int4 size,
+        uint type)
+    {
+        __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4,
+                          origin, size, type);
+    }
+    \n
+    \x23 if OCL20
+    \n
+    typedef struct _HsaAqlDispatchPacket {
+        uint    mix;
+        ushort  workgroup_size[3];
+        ushort  reserved2;
+        uint    grid_size[3];
+        uint    private_segment_size_bytes;
+        uint    group_segment_size_bytes;
+        ulong   kernel_object_address;
+        ulong   kernel_arg_address;
+        ulong   reserved3;
+        ulong   completion_signal;
+    } HsaAqlDispatchPacket;
+    \n
+    // This is an OpenCLized hsa_control_directives_t
+    typedef struct _AmdControlDirectives {
+        ulong   enabled_control_directives;
+        ushort  enable_break_exceptions;
+        ushort  enable_detect_exceptions;
+        uint    max_dynamic_group_size;
+        ulong   max_flat_grid_size;
+        uint    max_flat_workgroup_size;
+        uchar   required_dim;
+        uchar   reserved1[3];
+        ulong   required_grid_size[3];
+        uint    required_workgroup_size[3];
+        uchar   reserved2[60];
+    } AmdControlDirectives;
+    \n
+    // This is an OpenCLized amd_kernel_code_t
+    typedef struct _AmdKernelCode {
+        uint    amd_kernel_code_version_major;
+        uint    amd_kernel_code_version_minor;
+        ushort  amd_machine_kind;
+        ushort  amd_machine_version_major;
+        ushort  amd_machine_version_minor;
+        ushort  amd_machine_version_stepping;
+        long    kernel_code_entry_byte_offset;
+        long    kernel_code_prefetch_byte_offset;
+        ulong   kernel_code_prefetch_byte_size;
+        ulong   max_scratch_backing_memory_byte_size;
+        uint    compute_pgm_rsrc1;
+        uint    compute_pgm_rsrc2;
+        uint    kernel_code_properties;
+        uint    workitem_private_segment_byte_size;
+        uint    workgroup_group_segment_byte_size;
+        uint    gds_segment_byte_size;
+        ulong   kernarg_segment_byte_size;
+        uint    workgroup_fbarrier_count;
+        ushort  wavefront_sgpr_count;
+        ushort  workitem_vgpr_count;
+        ushort  reserved_vgpr_first;
+        ushort  reserved_vgpr_count;
+        ushort  reserved_sgpr_first;
+        ushort  reserved_sgpr_count;
+        ushort  debug_wavefront_private_segment_offset_sgpr;
+        ushort  debug_private_segment_buffer_sgpr;
+        uchar   kernarg_segment_alignment;
+        uchar   group_segment_alignment;
+        uchar   private_segment_alignment;
+        uchar   wavefront_size;
+        int     call_convention;
+        uchar   reserved1[12];
+        ulong   runtime_loader_kernel_symbol;
+        AmdControlDirectives control_directives;
+    } AmdKernelCode;
+    \n
+    typedef struct _HwDispatchHeader {
+        uint    writeData0;     // CP WRITE_DATA write to rewind for memory
+        uint    writeData1;
+        uint    writeData2;
+        uint    writeData3;
+        uint    rewind;         // REWIND execution
+        uint    startExe;       // valid bit
+        uint    condExe0;       // 0xC0032200 -- TYPE 3, COND_EXEC
+        uint    condExe1;       // 0x00000204 ----
+        uint    condExe2;       // 0x00000000 ----
+        uint    condExe3;       // 0x00000000 ----
+        uint    condExe4;       // 0x00000000 ----
+    } HwDispatchHeader;
+    \n
+    typedef struct _HwDispatch {
+        uint    packet0;        // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values)
+        uint    offset0;        // 0x00000204 ---- OFFSET
+        uint    startX;         // 0x00000000 ---- COMPUTE_START_X: START = 0x0
+        uint    startY;         // 0x00000000 ---- COMPUTE_START_Y: START = 0x0
+        uint    startZ;         // 0x00000000 ---- COMPUTE_START_Z: START = 0x0
+        uint    wrkGrpSizeX;    // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
+        uint    wrkGrpSizeY;    // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
+        uint    wrkGrpSizeZ;    // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0
+        uint    packet1;        // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+        uint    offset1;        // 0x0000020C ---- OFFSET
+        uint    isaLo;          // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0
+        uint    isaHi;          // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0
+        uint    packet2;        // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+        uint    offset2;        // 0x00000212 ---- OFFSET
+        uint    resource1;      // 0x00000000 ---- COMPUTE_PGM_RSRC1
+        uint    resource2;      // 0x00000000 ---- COMPUTE_PGM_RSRC2
+        uint    packet3;        // 0xc0017602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value)
+        uint    offset3;        // 0x00000215 ---- OFFSET
+        uint    pad31;          // 0x000003ff ---- COMPUTE_RESOURCE_LIMITS
+        uint    packet31;       // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value)
+        uint    offset31;       // 0x00000218 ---- OFFSET
+        uint    ringSize;       // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0
+        uint    user0;          // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values)
+        uint    offsUser0;      // 0x00000240 ---- OFFSET
+        uint    scratchLo;      // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0
+        uint    scratchHi;      // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000
+        uint    scratchSize;    // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0
+        uint    padUser;        // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC
+        uint    user1;          // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+        uint    offsUser1;      // 0x00000244 ---- OFFSET
+        uint    aqlPtrLo;       // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0
+        uint    aqlPtrHi;       // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0
+        uint    user2;          // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+        uint    offsUser2;      // 0x00000246 ---- OFFSET
+        uint    hsaQueueLo;     // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0
+        uint    hsaQueueHi;     // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0
+        uint    user3;          // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+        uint    offsUser3;      // 0x00000246 ---- OFFSET
+        uint    argsLo;         // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0
+        uint    argsHi;         // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0
+        uint    copyData;       // 0xC0044000 -- TYPE 3, COPY_DATA
+        uint    copyDataFlags;  // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0
+        uint    scratchAddrLo;  // 0x000201C4 ---- srcAddressLo
+        uint    scratchAddrHi;  // 0x00000000 ---- srcAddressHi
+        uint    shPrivateLo;    // 0x00002580 ---- dstAddressLo
+        uint    shPrivateHi;    // 0x00000000 ---- dstAddressHi
+        uint    user4;          // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values)
+        uint    offsUser4;      // 0x00000248 ---- OFFSET
+        uint    scratchOffs;    // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0
+        uint    privSize;       // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30
+        uint    packet4;        // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE
+        uint    glbSizeX;       // 0x00000000
+        uint    glbSizeY;       // 0x00000000
+        uint    glbSizeZ;       // 0x00000000
+        uint    padd41;         // 0x00000021
+    } HwDispatch;
+    \n
+    static const uint WavefrontSize     = 64;  // multiplier turning the per-work-item private size into a per-wave size
+    static const uint MaxWaveSize       = 0x400;  // despite the name, applied with max() as the minimum per-wave scratch size
+    static const uint UsrRegOffset      = 0x240;  // base value for the offsUserN fields; usrRegCnt is added to it
+    static const uint Pm4Nop            = 0xC0001002;  // NOP header; OR-ed with (count << 16) to blank unused template slots
+    static const uint Pm4UserRegs       = 0xC0007602;  // SET_SH_REG header (per the HwDispatch comments); OR-ed with (count << 16)
+    static const uint Pm4CopyReg        = 0xC0044000;  // COPY_DATA header (per the HwDispatch comments)
+    static const uint PrivateSegEna     = 0x1;  // kernel_code_properties bit: private/scratch segment is used
+    static const uint DispatchEna       = 0x2;  // kernel_code_properties bit: kernel takes the AQL packet pointer
+    static const uint QueuePtrEna       = 0x4;  // kernel_code_properties bit: kernel takes the queue pointer
+    static const uint KernelArgEna      = 0x8;  // kernel_code_properties bit: kernel takes the kernel-argument pointer
+    static const uint FlatScratchEna    = 0x20;  // kernel_code_properties bit: flat scratch setup is required
+    \n
+    uint GetCmdTemplateHeaderSize() { return sizeof(HwDispatchHeader); }  // size of the HwDispatchHeader struct as reported by sizeof
+    \n
+    uint GetCmdTemplateDispatchSize() { return sizeof(HwDispatch); }  // size of one HwDispatch struct as reported by sizeof
+    \n
+    void EmptyCmdTemplateDispatch(ulong cmdBuf)  // cmdBuf: address of a HwDispatch template, passed as an integer
+    {
+        volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf;
+        dispatch->glbSizeX = 0;  // zero all three global-size fields of the template;
+        dispatch->glbSizeY = 0;  // presumably this turns the dispatch into one that launches nothing - TODO confirm
+        dispatch->glbSizeZ = 0;
+    }
+    \n
+    void RunCmdTemplateDispatch(  // fills the HwDispatch template at cmdBuf from one AQL dispatch packet
+        ulong   cmdBuf,  // address of the HwDispatch template to patch, passed as an integer
+        __global HsaAqlDispatchPacket* aqlPkt,  // source packet: sizes, kernel object and kernel-argument addresses
+        ulong   scratch,  // scratch buffer address, written to scratchLo/scratchHi
+        ulong   hsaQueue,  // queue address handed to the kernel when QueuePtrEna is set
+        uint    scratchSize,  // scratch size; divided by the per-wave size to get the wave count
+        uint    scratchOffset,  // subtracted from scratch for the flat-scratch base, also written to scratchOffs
+        uint    numMaxWaves,  // upper bound applied to the computed wave count
+        uint    useATC)  // non-zero sets bit 0x100 in isaHi
+    \n
+    {
+        volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf;
+        uint usrRegCnt = 0;  // running count of user-data registers consumed so far
+
+        // Program workgroup size
+        dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0];
+        dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1];
+        dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2];
+
+        // ISA address
+        __global AmdKernelCode* kernelObj = (__global AmdKernelCode*)aqlPkt->kernel_object_address;
+        ulong isa = aqlPkt->kernel_object_address + kernelObj->kernel_code_entry_byte_offset;
+
+        dispatch->isaLo = (uint)(isa >> 8);  // the address is stored shifted right by 8 bits
+        dispatch->isaHi = (uint)(isa >> 40) | (useATC ? 0x100 : 0);
+
+        // Program PGM resource registers
+        dispatch->resource1 = kernelObj->compute_pgm_rsrc1;
+        dispatch->resource2 = kernelObj->compute_pgm_rsrc2;
+
+        uint    flags = kernelObj->kernel_code_properties;
+        uint    privateSize = kernelObj->workitem_private_segment_byte_size;
+
+        uint ldsSize = aqlPkt->group_segment_size_bytes +
+            kernelObj->workgroup_group_segment_byte_size;
+
+        // Round the LDS size up to whole 512-byte blocks (128 DWORDs * 4 bytes)
+        uint ldsBlocks = (ldsSize + 511) >> 9;
+
+        dispatch->resource2 |= (ldsBlocks << 15);  // block count is OR-ed in starting at bit 15
+
+        // Private/scratch segment was enabled
+        if (flags & PrivateSegEna) {
+            uint    waveSize = privateSize * WavefrontSize;
+            // 256 DWORDs is the minimum for SQ
+            waveSize = max(MaxWaveSize, waveSize);
+
+            uint numWaves = scratchSize / waveSize;  // waveSize is at least MaxWaveSize, so never zero
+
+            numWaves = min(numWaves, numMaxWaves);
+
+            dispatch->ringSize = numWaves;
+            dispatch->ringSize |= (waveSize >> 10) << 12;  // per-wave size divided by 1024, placed at bit 12
+            dispatch->user0 = Pm4UserRegs | (4 << 16);
+            dispatch->scratchLo = (uint)scratch;
+            dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle
+            dispatch->scratchSize = scratchSize;
+            usrRegCnt += 4;
+        }
+        else {
+            dispatch->ringSize = 0;
+            dispatch->user0 = Pm4Nop | (4 << 16);  // replace the 4-value register write with a NOP of the same count
+        }
+
+        // Pointer to the AQL dispatch packet
+        dispatch->user1 = (flags & DispatchEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
+        dispatch->offsUser1 = UsrRegOffset + usrRegCnt;
+        usrRegCnt += (flags & DispatchEna) ? 2 : 0;
+        ulong  gpuAqlPtr = (ulong)aqlPkt;
+        dispatch->aqlPtrLo = (uint)gpuAqlPtr;
+        dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32);
+
+        // Pointer to the AQL queue header
+        if (flags & QueuePtrEna) {
+            dispatch->user2 = Pm4UserRegs | (2 << 16);
+            dispatch->offsUser2 = UsrRegOffset + usrRegCnt;
+            usrRegCnt += 2;
+            dispatch->hsaQueueLo = (uint)hsaQueue;
+            dispatch->hsaQueueHi = (uint)(hsaQueue >> 32);
+        }
+        else {
+            dispatch->user2 = Pm4Nop | (2 << 16);
+        }
+
+        // Pointer to the AQL kernel arguments
+        dispatch->user3 = (flags & KernelArgEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16));
+        dispatch->offsUser3 = UsrRegOffset + usrRegCnt;
+        usrRegCnt += (flags & KernelArgEna) ? 2 : 0;
+        dispatch->argsLo = (uint)aqlPkt->kernel_arg_address;
+        dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32);
+
+        // Provide pointer to the private/scratch buffer for the flat address
+        if (flags & FlatScratchEna) {
+            dispatch->copyData = Pm4CopyReg;  // NOTE(review): copyDataFlags and user4 are not written here; presumably preset in the template - confirm
+            dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16);
+            dispatch->offsUser4 = UsrRegOffset + usrRegCnt;
+            dispatch->scratchOffs = scratchOffset;
+            dispatch->privSize = privateSize;
+        }
+        else {
+            dispatch->copyData = Pm4Nop | (8 << 16);  // NOP with count 8 written over the COPY_DATA header
+        }
+
+        // Update the global launch grid
+        dispatch->glbSizeX = aqlPkt->grid_size[0];
+        dispatch->glbSizeY = aqlPkt->grid_size[1];
+        dispatch->glbSizeZ = aqlPkt->grid_size[2];
+    }
+    \n
+    __kernel void scheduler(  // kernel entry point that only forwards to the runtime scheduler builtin
+        __global void * queue,  // passed through unchanged
+        __global void * params,  // passed through unchanged
+        uint paramIdx)  // passed through unchanged
+    {
+        __amd_scheduler(queue, params, paramIdx);  // the host code may rename this builtin before building (see OCLBlitKernel::open)
+    }
+    \n
+    \x23 endif
+    \n
+    );
+
+enum {  // indices of the kernels in the blit program; the order must match BlitName[]
+  BlitCopyImage = 0,
+  BlitCopyImage1DA,
+  BlitCopyImageToBuffer,
+  BlitCopyBufferToImage,
+  BlitCopyBufferRect,
+  BlitCopyBufferRectAligned,
+  BlitCopyBuffer,
+  BlitCopyBufferAligned,
+  FillBuffer,
+  FillImage,
+  Scheduler,  // last real entry; the kernel loops in open() stop before it when sub == 1
+  BlitTotal  // number of entries; sizes BlitName[] and the kernel array in open()
+};
+
+static const char* BlitName[BlitTotal] = {  // kernel names handed to clCreateKernel, indexed by the Blit* enum values
+    "copyImage",         "copyImage1DA",      "copyImageToBuffer",
+    "copyBufferToImage", "copyBufferRect",    "copyBufferRectAligned",
+    "copyBuffer",        "copyBufferAligned", "fillBuffer",
+    "fillImage",         "scheduler",
+};
+
+OCLBlitKernel::OCLBlitKernel() : time_(0.f) { _numSubTests = 1; }  // single sub-test; time_ starts at 0 because run() reads it even when open() returns early
+
+OCLBlitKernel::~OCLBlitKernel() {}  // empty: nothing is released here
+
+// Builds the blit/scheduler program for one device and stores the build plus
+// kernel-creation time (ms) in time_. Skips, with the reason in
+// testDescString, on non-GPU devices and on pre-2.0 CL_DEVICE_VERSION.
+// test/units/conversion go to OCLTestImp::open(); deviceId indexes devices_.
+void OCLBlitKernel::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  time_ = 0.f;  // run() reports time_ even when this function bails out early
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  CPerfCounter timer;
+  int sub = 0;
+  std::string options = "-cl-std=CL2.0 -DOCL20=1";
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    testDescString = "GPU device is required for this test!\n";
+    return;
+  }
+
+  // Version strings live in a std::string: the new[] buffers used before were
+  // leaked by CHECK_RESULT early returns and freed with scalar delete.
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  std::string strVersion(param_size, '\0');
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, &strVersion[0], 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  // CL_DEVICE_VERSION reads "OpenCL <major>.<minor> ...": index 7 is the
+  // major digit. A string too short to hold it is treated as pre-2.0.
+  if (strVersion.size() <= 7 || strVersion[7] < '2') {
+    options = "-DOCL20=0";
+    sub = 1;
+    testDescString = "Currently it works for OCL20 devices only!\n";
+    return;
+  }
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  strVersion.assign(param_size, '\0');
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION,
+                                     param_size, &strVersion[0], 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Pick the runtime-specific scheduler builtin from the driver version tag.
+  static const char AmdScheduler[] = "amd_scheduler";
+  const char* AmdSchedulerPatch = NULL;
+  if (strVersion.find("LC") != std::string::npos) {
+    if (strVersion.find("PAL") != std::string::npos) {
+      AmdSchedulerPatch = "amd_scheduler_pal";
+    } else if (strVersion.find("HSA") != std::string::npos) {
+      AmdSchedulerPatch = "amd_scheduler_rocm";
+    }
+  }
+
+  // Rename the first two occurrences of "amd_scheduler" as before, but stop
+  // when one is missing: replace() throws if given the npos find() returns.
+  std::string sch = strKernel;
+  size_t loc = 0;
+  for (int n = 0; NULL != AmdSchedulerPatch && n < 2; ++n) {
+    loc = sch.find(AmdScheduler, loc);
+    if (loc == std::string::npos) break;
+    sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPatch);
+    loc += strlen(AmdSchedulerPatch);  // skip the text that was just inserted
+  }
+
+  timer.Reset();
+  timer.Start();
+  const char* strProgram = sch.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strProgram, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    options.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the query below fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  // Create every kernel under the timer. Kernels made before a failure are
+  // released too, so the return inside CHECK_RESULT cannot leak them.
+  cl_kernel kernels[BlitTotal];
+  int made = 0;
+  for (; made < BlitTotal - sub; ++made) {
+    kernels[made] = _wrapper->clCreateKernel(program_, BlitName[made], &error_);
+    if (error_ != CL_SUCCESS) break;
+  }
+  timer.Stop();
+  for (int i = 0; i < made; ++i) _wrapper->clReleaseKernel(kernels[i]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  time_ = (float)timer.GetElapsedTime() * 1000.f;  // seconds to milliseconds
+  testDescString = "Blit kernel compilaiton time (ms):";
+}
+
+void OCLBlitKernel::run(void) { _perfInfo = time_; }  // publishes the time measured in open(); no OpenCL work happens here
+
+unsigned int OCLBlitKernel::close(void) { return OCLTestImp::close(); }  // defers entirely to the base class and returns its result
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h
new file mode 100644
index 0000000000..4f2d90957d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BLIT_KERNEL_H_
+#define _OCL_BLIT_KERNEL_H_
+
+#include "OCLTestImp.h"
+
+class OCLBlitKernel : public OCLTestImp {  // test that times building the blit kernel program
+ public:
+  OCLBlitKernel();
+  virtual ~OCLBlitKernel();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // builds the program and measures the time
+  virtual void run(void);  // reports the measured time through _perfInfo
+  virtual unsigned int close(void);  // forwards to OCLTestImp::close()
+
+ private:
+  float time_;  // build + kernel-creation time in milliseconds, set by open()
+};
+
+#endif  // _OCL_BLIT_KERNEL_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp
new file mode 100644
index 0000000000..5278fe3998
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp
@@ -0,0 +1,289 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLBufferFromImage.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define GROUP_SIZE 256
+
+const static char strKernel[] =  // OpenCL C source of buffer2bufferCopy: output[i] = input[i] for global id i
+    "__kernel void buffer2bufferCopy(                                          "
+    "         \n"
+    "    __global char* input,                                                 "
+    "          \n"
+    "    __global char* output)                                                "
+    "          \n"
+    "{                                                                         "
+    "         \n"
+    "    int coord = (int)(get_global_id(0));                                  "
+    "          \n"
+    "    output[coord] = input[coord];                                         "
+    "          \n"
+    "}                                                                         "
+    "         \n";
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateBufferFromImageAMD_fn)(  // pointer type for the extension entry point
+    cl_context context, cl_mem image, cl_int *errcode_ret);
+clCreateBufferFromImageAMD_fn clCreateBufferFromImageAMD;  // looked up in open(); NOTE(review): non-static global - confirm no other file defines the same name
+
+OCLBufferFromImage::OCLBufferFromImage() : OCLTestImp() {
+  _numSubTests = 2;  // run() picks direct read-back for even indices and the kernel copy for odd ones
+  blockSizeX = GROUP_SIZE;  // CompileKernel() may lower this to the kernel work-group limit
+  blockSizeY = 1;
+}
+
+OCLBufferFromImage::~OCLBufferFromImage() {}  // empty: memory objects are released in close()
+
+// Per-sub-test setup: resets state, opens the base test, then either marks
+// the test skipped (done = true, reason in testDescString) or builds the
+// copy kernel and allocates the buffers/image.
+void OCLBufferFromImage::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  buffer = NULL;
+  bufferImage = NULL;
+  clImage2D = NULL;
+  bufferOut = NULL;
+  bufferSize = 0;
+  pitchAlignment = 0;
+  done = false;
+  _openTest = test;
+  srand((unsigned int)time(NULL));  // Initialize random number seed
+  OCLTestImp::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  const bool isGpu = (deviceType & CL_DEVICE_TYPE_GPU) != 0;
+  if (!isGpu) {
+    testDescString = "GPU device is required for this test!\n";
+    done = true;
+    return;
+  }
+  clCreateBufferFromImageAMD =
+      (clCreateBufferFromImageAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clCreateBufferFromImageAMD");
+  if (NULL == clCreateBufferFromImageAMD) {
+    testDescString = "clCreateBufferFromImageAMD not found!\n";
+    done = true;
+    return;
+  }
+  CompileKernel();
+  AllocateOpenCLBuffer();
+}
+
+// Even sub-tests read the image-backed buffer directly; odd ones copy it
+// through the kernel first. Does nothing if open() failed or skipped.
+void OCLBufferFromImage::run(void) {
+  if (_errorFlag || done) return;
+
+  const bool useKernel = (_openTest % 2) != 0;
+  if (useKernel) {
+    testKernel();
+    return;
+  }
+  testReadBuffer(bufferImage);
+}
+
+// Creates the objects under test: `buffer` (filled with a byte ramp), a 2-D
+// RGBA8 image created on top of it (clImage2D), the buffer obtained from
+// that image through clCreateBufferFromImageAMD (bufferImage), and a plain
+// output buffer (bufferOut). Sets done = true when the extension reports
+// CL_INVALID_OPERATION; other failures go through CHECK_RESULT.
+void OCLBufferFromImage::AllocateOpenCLBuffer() {
+  cl_int status = 0;
+
+  pitchAlignment = 0;
+  status = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
+                                     sizeof(cl_uint), &pitchAlignment, NULL);
+  CHECK_RESULT(status != CL_SUCCESS, "CL_DEVICE_IMAGE_PITCH_ALIGNMENT failed");
+
+  // A reported alignment of 0 would wrap the mask computed below to
+  // 0xFFFFFFFF and produce a zero-sized buffer, so fail with a clear message
+  // instead of a confusing allocation error further down.
+  CHECK_RESULT(pitchAlignment == 0,
+               "CL_DEVICE_IMAGE_PITCH_ALIGNMENT is 0 (no image from buffer)");
+  pitchAlignment--;  // from here on: (alignment - 1), used as a bit mask
+
+  // Round the row pitch up to the alignment. The mask arithmetic assumes the
+  // alignment is a power of two.
+  const unsigned int pitch = (imageWidth + pitchAlignment) & ~pitchAlignment;
+  bufferSize = pitch * imageHeight;
+
+  // Host-side ramp (byte y holds y truncated to 8 bits); testReadBuffer()
+  // checks for exactly this pattern.
+  unsigned char *sourceData = new unsigned char[bufferSize];
+  for (unsigned int y = 0; y < bufferSize; y++) {
+    sourceData[y] = (unsigned char)y;
+  }
+  buffer = _wrapper->clCreateBuffer(context_,
+                                    CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
+                                    bufferSize, sourceData, &status);
+  // Free the host copy before CHECK_RESULT, which returns on failure.
+  delete[] sourceData;
+  CHECK_RESULT(buffer == NULL || status != CL_SUCCESS,
+               "clCreateBuffer(buffer) failed");
+
+  // The image is imageWidth / 4 RGBA8 pixels wide, i.e. imageWidth bytes per
+  // row, and uses `buffer` as its backing store. The two initializers differ
+  // only in how the trailing member is spelled for the header version.
+  const cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8};
+#if defined(CL_VERSION_2_0)
+  const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, imageWidth / 4,
+                              imageHeight, 0, 0, pitch, 0, 0, 0, {buffer}};
+#else
+  const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, imageWidth / 4,
+                              imageHeight, 0, 0, pitch, 0, 0, 0, buffer};
+#endif
+  clImage2D = _wrapper->clCreateImage(context_, CL_MEM_READ_WRITE, &format,
+                                      &desc, NULL, &status);
+  CHECK_RESULT(clImage2D == NULL || status != CL_SUCCESS,
+               "AllocateOpenCLImage() failed");
+
+  // CL_INVALID_OPERATION is treated as "not supported": skip, do not fail.
+  bufferImage = clCreateBufferFromImageAMD(context_, clImage2D, &status);
+  if (status == CL_INVALID_OPERATION) {
+    testDescString =
+        "clCreateBufferFromImageAMD not supported on this device!\n";
+    done = true;
+    return;
+  }
+  CHECK_RESULT(bufferImage == NULL || status != CL_SUCCESS,
+               "clCreateBufferFromImageAMD(bufferImage) failed");
+
+  // Destination for the kernel copy performed by CopyOpenCLBuffer(). The
+  // unused CL_DRIVER_VERSION query that used to sit here was removed.
+  bufferOut = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufferSize,
+                                       NULL, &status);
+  CHECK_RESULT(bufferOut == NULL || status != CL_SUCCESS,
+               "clCreateBuffer(bufferOut) failed");
+}
+
+// Reads bufferSize bytes of `buffer` back to the host and checks the byte
+// ramp written by AllocateOpenCLBuffer(): byte y must equal (unsigned char)y.
+void OCLBufferFromImage::testReadBuffer(cl_mem buffer) {
+  unsigned char *dstData = new unsigned char[bufferSize];
+  cl_int status = clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, 1, 0,
+                                      bufferSize, dstData, 0, 0, 0);
+  ::clFinish(cmdQueues_[_deviceId]);
+
+  // A failed read leaves dstData indeterminate: report it rather than compare
+  // garbage. The NO_RETURN form is used so dstData is always freed below.
+  CHECK_RESULT_NO_RETURN(status != CL_SUCCESS,
+                         "clEnqueueReadBuffer() failed: %i", status);
+  for (unsigned int y = 0; status == CL_SUCCESS && y < bufferSize; y++) {
+    if (dstData[y] == (unsigned char)y) continue;
+    CHECK_RESULT_NO_RETURN(true, "CheckCLBuffer: *(dstData+y)!=y => %i != %i",
+                           dstData[y], y);
+    break;  // report only the first mismatch
+  }
+  delete[] dstData;
+}
+
+void OCLBufferFromImage::testKernel() {  // odd sub-tests: kernel copy, then verify the output buffer
+  CopyOpenCLBuffer(bufferImage);
+  if (_errorFlag) return;  // copy failed (flag presumably set by CHECK_RESULT): nothing valid to verify
+  testReadBuffer(bufferOut);
+}
+
+// Frees whichever of the four test-owned memory objects exist, then defers.
+unsigned int OCLBufferFromImage::close(void) {
+  cl_mem owned[] = {bufferImage, clImage2D, buffer, bufferOut};
+  for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); ++i)
+    if (owned[i] != NULL) clReleaseMemObject(owned[i]);
+  return OCLTestImp::close();
+}
+
+// Runs buffer2bufferCopy over bufferSize work-items, copying `buffer` into
+// bufferOut byte by byte, and waits for the queue to finish.
+//   buffer - source memory object bound to kernel argument 0
+void OCLBufferFromImage::CopyOpenCLBuffer(cl_mem buffer) {
+  cl_int status = 0;
+
+  // Argument 0: source buffer, argument 1: destination buffer.
+  status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLBuffer() failed at "
+               "clSetKernelArg(kernel_,0,sizeof(cl_mem),&buffer)");
+  status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &bufferOut);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLBuffer() failed at "
+               "clSetKernelArg(kernel_,1,sizeof(cl_mem),&bufferOut)");
+
+  // One work-item per byte. The work-group size is left to the runtime (NULL
+  // local size); the unused offset/local-size arrays the original declared
+  // were dropped.
+  size_t globalThreads[] = {bufferSize};
+  status = clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                  globalThreads, NULL, 0, NULL, 0);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLBuffer() failed at clEnqueueNDRangeKernel");
+
+  status = clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLBuffer() failed at clFinish");
+}
+
+// Builds strKernel for the device under test, creates the buffer2bufferCopy
+// kernel in kernel_ and clamps blockSizeX to the kernel work-group limit.
+// Every OpenCL failure is reported through CHECK_RESULT; previously a failed
+// build returned silently and left kernel_ unset.
+void OCLBufferFromImage::CompileKernel() {
+  cl_int status = 0;
+  // strKernel is NUL-terminated, so no length array is passed. sizeof() would
+  // have counted the terminator as part of the source text.
+  const char *strs = strKernel;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs, NULL,
+                                                 &status);
+  CHECK_RESULT(program_ == NULL || status != CL_SUCCESS,
+               "clCreateProgramWithSource() failed");
+
+  status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (status == CL_BUILD_PROGRAM_FAILURE) {
+    // Ask for the log size first, then fetch and print the log itself.
+    size_t buildLogSize = 0;
+    clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &buildLogSize);
+    std::string buildLog(buildLogSize, '\0');
+    clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                          buildLogSize, &buildLog[0], NULL);
+    printf("%s", buildLog.c_str());
+  }
+  CHECK_RESULT(status != CL_SUCCESS, "clBuildProgram() failed");
+
+  // get a kernel object handle for a kernel with the given name
+  kernel_ = _wrapper->clCreateKernel(program_, "buffer2bufferCopy", &status);
+  CHECK_RESULT(kernel_ == NULL || status != CL_SUCCESS,
+               "clCreateKernel(buffer2bufferCopy) failed");
+  size_t kernel2DWorkGroupSize = 0;
+  status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId],
+                                    CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
+                                    &kernel2DWorkGroupSize, 0);
+  CHECK_RESULT(status != CL_SUCCESS, "clGetKernelWorkGroupInfo() failed");
+  if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize &&
+      blockSizeX > kernel2DWorkGroupSize) {
+    blockSizeX = kernel2DWorkGroupSize;
+    blockSizeY = 1;
+  }
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h
new file mode 100644
index 0000000000..aeab03b617
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLBufferFromImage_H_
+#define _OCLBufferFromImage_H_
+
+#include "OCLTestImp.h"
+
+class OCLBufferFromImage : public OCLTestImp {  // exercises clCreateBufferFromImageAMD on a buffer-backed 2-D image
+ public:
+  OCLBufferFromImage();
+  virtual ~OCLBufferFromImage();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);  // resets state, builds the kernel, allocates the objects
+  virtual void run(void);  // runs the sub-test chosen by the index given to open()
+  virtual unsigned int close(void);  // releases the memory objects, then calls the base class
+
+ protected:
+  static const unsigned int imageWidth = 1920;  // row width in bytes; the image itself is created imageWidth / 4 pixels wide
+  static const unsigned int imageHeight = 1080;  // number of rows
+
+  void testReadBuffer(cl_mem buffer);  // reads `buffer` back and checks the byte ramp
+  void testKernel();  // copies bufferImage to bufferOut with the kernel, then checks bufferOut
+  void AllocateOpenCLBuffer();  // creates buffer, clImage2D, bufferImage and bufferOut
+  void CopyOpenCLBuffer(cl_mem buffer);  // enqueues the copy kernel from `buffer` to bufferOut
+  void CompileKernel();  // builds the program and creates kernel_
+
+  bool done;  // true when the test is skipped; run() then does nothing
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  size_t bufferSize;  // aligned row pitch times imageHeight, in bytes
+  cl_mem buffer;  // source buffer holding the byte ramp
+  cl_mem clImage2D;  // image created with `buffer` as its backing store
+  cl_mem bufferImage;  // buffer returned by clCreateBufferFromImageAMD for clImage2D
+  cl_mem bufferOut;  // destination of the kernel copy
+  cl_uint pitchAlignment;  // after AllocateOpenCLBuffer(): the device pitch alignment minus one
+};
+
+#endif  // _OCLBufferFromImage_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp
new file mode 100644
index 0000000000..e4fa6968da
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp
@@ -0,0 +1,178 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLCPUGuardPages.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#ifdef _WIN32
+#include <excpt.h>
+#include <windows.h>  // for EXCEPTION_ACCESS_VIOLATION
+
+// Exception filter: access violations are handled, anything else is passed on.
+int filter(unsigned int code, struct _EXCEPTION_POINTERS* ep) {
+  printf("In filter\n");
+  if (code != EXCEPTION_ACCESS_VIOLATION) {
+    printf("didn't catch AV, unexpected.");
+    return EXCEPTION_CONTINUE_SEARCH;
+  }
+  printf("caught AV as expected.");
+  return EXCEPTION_EXECUTE_HANDLER;
+}
+
+#else
+#include <signal.h>
+
+#include <csignal>
+#include <cstdlib>
+#include <iostream>
+
+void segfault_sigaction(int signal, siginfo_t *si, void *arg) {  // SA_SIGINFO-style handler; its only visible use is in commented-out code
+  printf("Caught segfault at address %p\n", si->si_addr);  // NOTE(review): printf is not async-signal-safe
+  exit(0);  // ends the process with status 0 - presumably so an expected fault counts as success; TODO confirm
+}
+
+#endif
+
+const static char* strKernel =  // OpenCL C source: out[gid + out_offset] = in[gid + in_offset] * -1 per work-item
+    "__kernel void simple_in_out_test( int in_offset, \n"
+    "                                  int out_offset, \n"
+    "                                  __global float4* in,          \n"
+    "                                  __global float4* out) { \n"
+    "unsigned int gid = get_global_id(0);\n"
+    "out[gid + out_offset] = in[gid + in_offset] * -1.f;"
+    "}";
+
+testOCLCPUGuardPagesStruct testOCLCPUGuardPagesList[] = {  // one entry per sub-test: {useGuardPages, shouldFail, items, in_offset, out_offset}
+    {false, false, 1024, 0, 0}, {true, false, 1024, 0, 0},
+    {false, false, 1024, 0, 0}, {true, true, 1024, 0, 0},
+    {false, false, 1024, 0, 0}, {true, true, 1024, 0, 0}};
+
+OCLCPUGuardPages::OCLCPUGuardPages() {
+  _numSubTests =  // one sub-test per entry of testOCLCPUGuardPagesList
+      sizeof(testOCLCPUGuardPagesList) / sizeof(testOCLCPUGuardPagesStruct);
+
+  /* Disabled: would install segfault_sigaction as the SIGSEGV handler.
+      struct sigaction sa;
+
+      memset(&sa, 0, sizeof(sa));
+      sigemptyset(&sa.sa_mask);
+      sa.sa_sigaction = segfault_sigaction;
+      sa.sa_flags   = SA_SIGINFO;
+
+      sigaction(SIGSEGV, &sa, NULL);
+  */
+}
+
+OCLCPUGuardPages::~OCLCPUGuardPages() {}  // empty: nothing is released here
+
+// Builds the simple_in_out_test kernel and creates the input and output
+// buffers (testValues.items float4 elements each) for sub-test `test`. The
+// input buffer is filled so that element i holds (i, i, i, i).
+void OCLCPUGuardPages::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  // Initialize the current test parameters.
+  testValues = testOCLCPUGuardPagesList[test];
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the query below fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "simple_in_out_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // Create both buffers first and hand them to buffers_ right away, so no
+  // host allocation is live across a CHECK_RESULT early return.
+  const size_t bytes = testValues.items * sizeof(cl_float4);
+  cl_mem inBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                             bytes, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(inBuffer);
+  cl_mem outBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                              bytes, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(outBuffer);
+  cl_float4* dummyIn = new cl_float4[testValues.items];
+  for (int i = 0; i < testValues.items; i++) {
+    dummyIn[i].s[0] = dummyIn[i].s[1] = dummyIn[i].s[2] = dummyIn[i].s[3] =
+        i * 1.f;
+  }
+  // Blocking write, so the host copy can be freed before the status check.
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], inBuffer,
+                                          CL_TRUE, 0, bytes, dummyIn, 0, 0, 0);
+  delete[] dummyIn;
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,  // empty callback; not referenced by the visible code
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+void OCLCPUGuardPages::run(void) {  // binds the four kernel arguments and enqueues the kernel once
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_int),
+                                    &testValues.in_offset);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_int),
+                                     &testValues.out_offset);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), &buffers()[0]);  // input buffer created in open()
+  error_ |= _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), &buffers()[1]);  // output buffer created in open()
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t globalThreads[1];
+  globalThreads[0] = testValues.items;  // one work-item per float4 element
+  size_t localThreads[1] = {256};
+
+#ifdef _WIN32
+  //    LPTOP_LEVEL_EXCEPTION_FILTER pOriginalFilter =
+  //    SetUnhandledExceptionFilter(MyUnhandledExceptionFilter);
+  //    AddVectoredExceptionHandler(1,MyVectorExceptionFilter);
+
+  try {  // NOTE(review): whether catch (...) also sees access violations depends on compiler settings - confirm
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, globalThreads, localThreads,
+                                              0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  } catch (...) {
+    printf("exception caught in OCLTest...\n");
+  }
+
+#else
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,  // same enqueue as above, without the try block
+                                            NULL, globalThreads, localThreads,
+                                            0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+#endif
+}
+
+unsigned int OCLCPUGuardPages::close(void) { return OCLTestImp::close(); }  // defers entirely to the base class and returns its result
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h
new file mode 100644
index 0000000000..a90451c4b7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_CPU_GUARD_PAGES_H_
+#define _OCL_CPU_GUARD_PAGES_H_
+
+#include "OCLTestImp.h"
+
+typedef struct {  // settings held by OCLCPUGuardPages in its testValues member
+  bool useGuardPages;  // NOTE(review): no consumer visible here; presumably toggles guard pages - confirm in open()
+  bool shouldFail;     // NOTE(review): no consumer visible here; presumably marks an expected failure - confirm
+  int items;           // global work size used by run() for clEnqueueNDRangeKernel
+  int in_offset;       // passed to the kernel as argument 0
+  int out_offset;      // passed to the kernel as argument 1
+} testOCLCPUGuardPagesStruct;
+
+class OCLCPUGuardPages : public OCLTestImp {
+  // Test case whose run() launches one kernel configured by testValues.
+ public:
+  OCLCPUGuardPages();
+  virtual ~OCLCPUGuardPages();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);            // sets the kernel arguments and enqueues the kernel
+  virtual unsigned int close(void);  // forwards to OCLTestImp::close()
+
+ private:
+  testOCLCPUGuardPagesStruct testValues;  // work size and offsets read by run()
+};
+
+#endif  // _OCL_CPU_GUARD_PAGES_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp
new file mode 100644
index 0000000000..4c40ace60a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLCreateBuffer.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sstream>
+#ifdef ATI_OS_LINUX
+#include <unistd.h>
+#endif
+
+#include "CL/cl.h"
+
+const static size_t MaxSubTests = 1;  // copied into _numSubTests by the constructor
+
+// Registers MaxSubTests sub-tests and puts every member in a known state
+// (testID_ included, which open() later overwrites with the sub-test index).
+OCLCreateBuffer::OCLCreateBuffer() : failed_(false), testID_(0), maxSize_(0) {
+  _numSubTests = MaxSubTests;
+}
+
+OCLCreateBuffer::~OCLCreateBuffer() {}  // empty: the destructor itself releases nothing
+
+// Opens the base test, queries (and checks) CL_DEVICE_MAX_MEM_ALLOC_SIZE for
+// the selected device and creates one read/write buffer of that size, which is
+// appended to buffers_. On 32-bit Windows the size is capped at 512 MiB.
+void OCLCreateBuffer::open(unsigned int test, char *units, double &conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxSize_),
+      &maxSize_, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed");
+//! Workaround out of range issue in Windows 32bit apps
+#if defined(_WIN32) && !defined(_WIN64)
+  static const size_t MaxSizeLimit = 512 * 1024 * 1024;
+  if (maxSize_ > MaxSizeLimit) {
+    maxSize_ = MaxSizeLimit;
+  }
+#endif
+  cl_mem buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, maxSize_,
+                                        NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buf);
+}
+
+// Fills the buffer with PATTERN on the device and verifies it, then rewrites it
+// from the host with PATTERN + 1 and verifies again, staging through a host
+// buffer of maxSteps bytes. The elapsed time (ms) is stored in _perfInfo.
+void OCLCreateBuffer::run(void) {
+  CPerfCounter timer;
+
+  cl_uchar pattern = PATTERN;
+  timer.Reset();
+  timer.Start();
+  error_ = /*_wrapper->*/ clEnqueueFillBuffer(
+      cmdQueues_[_deviceId], buffers_[0], &pattern, sizeof(pattern), 0,
+      maxSize_, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillBuffer() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Staging size: the buffer size, capped at half of physical memory on Linux
+  // and then halved until an allocation succeeds.
+  size_t maxSteps = maxSize_;
+#ifdef ATI_OS_LINUX
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGE_SIZE);
+  if (pages > 0 && page_size > 0) {  // sysconf() reports failure as -1
+    const size_t halfPhysical = (size_t)pages * (size_t)page_size / 2;
+    if (maxSteps > halfPhysical) {
+      maxSteps = halfPhysical;
+    }
+  }
+#endif
+  void *resultBuf = NULL;
+  // Stop at zero: a zero-byte chunk could never cover the buffer.
+  while (maxSteps > 0 && (resultBuf = malloc(maxSteps)) == NULL) {
+    maxSteps /= 2;
+  }
+  if (resultBuf == NULL) {
+    CHECK_RESULT(true, "malloc() failed");
+    return;
+  }
+
+  checkResult(maxSteps, resultBuf, pattern);
+  pattern += 1;
+  memset(resultBuf, pattern, maxSteps);
+  writeBuffer(maxSteps, resultBuf);
+  memset(resultBuf, 0x00, maxSteps);
+  checkResult(maxSteps, resultBuf, pattern);
+  free(resultBuf);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  _perfInfo = (float)sec * 1000.f;
+
+  std::stringstream str;
+  str << "Max single alloc (size of " << maxSize_ << " bytes) "
+      << "Max single read/write (size of " << maxSize_
+      << " bytes) create time (ms):";
+  testDescString = str.str();
+}
+
+// Reads buffers_[0] back in chunks of at most maxSteps bytes into resultBuf
+// and checks every byte against pattern. On the first mismatch error_ is set
+// to -1, the failure is reported through CHECK_RESULT and the scan stops.
+void OCLCreateBuffer::checkResult(size_t maxSteps, void *resultBuf,
+                                  cl_uchar pattern) {
+  if (maxSteps == 0) {
+    return;  // a zero-byte chunk would never advance through the buffer
+  }
+  const cl_uchar *bytes = (const cl_uchar *)resultBuf;
+  for (size_t startPoint = 0; startPoint < maxSize_; startPoint += maxSteps) {
+    const cl_ulong left = maxSize_ - startPoint;
+    const size_t readSize = (left < maxSteps) ? (size_t)left : maxSteps;
+    // No event is requested: clFinish() below waits for the read, and an
+    // unreleased cl_event would leak on every chunk.
+    error_ = /*wrapper->*/ clEnqueueReadBuffer(
+        cmdQueues_[_deviceId], buffers_[0], CL_FALSE, startPoint, readSize,
+        resultBuf, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+    for (size_t i = 0; i < readSize; i++) {
+      if (bytes[i] != pattern) {
+        error_ = -1;
+        CHECK_RESULT((error_ != CL_SUCCESS), "checkResult() failed");
+        return;
+      }
+    }
+  }
+}
+
+
+// Writes dataBuf over all of buffers_[0], at most maxSteps bytes per chunk.
+void OCLCreateBuffer::writeBuffer(size_t maxSteps, void *dataBuf) {
+  if (maxSteps == 0) {
+    return;  // a zero-byte chunk would never advance through the buffer
+  }
+  for (size_t startPoint = 0; startPoint < maxSize_; startPoint += maxSteps) {
+    const cl_ulong left = maxSize_ - startPoint;
+    const size_t writeSize = (left < maxSteps) ? (size_t)left : maxSteps;
+    // No event requested: clFinish() waits, and an unreleased event would leak.
+    error_ = /*wrapper->*/ clEnqueueWriteBuffer(
+        cmdQueues_[_deviceId], buffers_[0], CL_FALSE, startPoint, writeSize,
+        dataBuf, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+}
+
+unsigned int OCLCreateBuffer::close(void) { return OCLTestImp::close(); }  // no test-specific teardown: forwards the base-class result
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h
new file mode 100644
index 0000000000..7797563753
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_CREATE_BUFFER_H_
+#define _OCL_CREATE_BUFFER_H_
+
+#include "OCLTestImp.h"
+#define PATTERN 0x20  // byte value OCLCreateBuffer::run() first fills the buffer with
+
+class OCLCreateBuffer : public OCLTestImp {
+  // Creates one buffer of CL_DEVICE_MAX_MEM_ALLOC_SIZE bytes and checks that
+  // it can be filled, read back and rewritten in full.
+ public:
+  OCLCreateBuffer();
+  virtual ~OCLCreateBuffer();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual void writeBuffer(size_t tmpMaxSize, void* dataBuf);  // tmpMaxSize: chunk size in bytes
+  virtual void checkResult(size_t tmpMaxSize, void* resultBuf,
+                           cl_uchar pattern);  // compares every byte read back with pattern
+  virtual unsigned int close(void);
+ private:
+  bool failed_;          // set to false by the constructor
+  unsigned int testID_;  // sub-test index recorded by open()
+  cl_ulong maxSize_;     // CL_DEVICE_MAX_MEM_ALLOC_SIZE of the device; size of the test buffer
+};
+
+#endif  // _OCL_CREATE_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp
new file mode 100644
index 0000000000..3853eeacf7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLCreateContext.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+OCLCreateContext::OCLCreateContext() { _numSubTests = 1; }  // the test has a single sub-test
+
+OCLCreateContext::~OCLCreateContext() {}  // empty: the destructor itself releases nothing
+
+void OCLCreateContext::open(unsigned int test, char *units, double &conversion,
+                            unsigned int deviceId) {  // bookkeeping only: OCLTestImp::open() is not called, test/units are unused
+  _crcword = 0;          // value later returned by close()
+  conversion = 1.0f;
+  _deviceId = deviceId;  // recorded only; run() picks the first GPU it enumerates
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,  // no-op callback handed to clCreateContext() in run()
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Finds the AMD platform, takes its first GPU device and checks that a context
+// with a notification callback can be created on it and released again.
+void OCLCreateContext::run(void) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+
+  int error = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    for (unsigned i = 0; error == CL_SUCCESS && i < numPlatforms; ++i) {
+      char pbuf[100];
+      // Compare the vendor string only if the query actually filled it in.
+      int infoError = _wrapper->clGetPlatformInfo(
+          platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
+      if (infoError == CL_SUCCESS &&
+          !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        platform = platforms[i];
+        break;
+      }
+    }
+    // new[] needs delete[]; free before the check so no path leaks the array.
+    delete[] platforms;
+    CHECK_RESULT(error != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+
+  /* The AMD platform is required; give up if it was not found. */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  error = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
+                                   &num_devices);
+  CHECK_RESULT(error != CL_SUCCESS, "clGetDeviceIDs failed");
+  if (num_devices > 0) {
+    devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  }
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Keep only the first device id; the list itself is freed on every path. */
+  error = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices,
+                                   devices, NULL);
+  if (error == CL_SUCCESS) {
+    device = devices[0];
+  }
+  free(devices);
+  CHECK_RESULT(error != CL_SUCCESS, "clGetDeviceIDs failed");
+  cl_context gContext = _wrapper->clCreateContext(
+      NULL, 1, &device, notify_callback, NULL, &error);
+  CHECK_RESULT(gContext == 0, "clCreateContext failed");
+  error = _wrapper->clReleaseContext(gContext);
+  CHECK_RESULT(error != CL_SUCCESS, "clReleaseContext failed");
+}
+
+unsigned int OCLCreateContext::close(void) { return _crcword; }  // _crcword was reset to 0 by open()
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h
new file mode 100644
index 0000000000..bcff21868a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_CreateContext_H_
+#define _OCL_CreateContext_H_
+
+#include "OCLTestImp.h"
+
+class OCLCreateContext : public OCLTestImp {
+  // Test that creates and releases a context on the AMD platform's first GPU.
+ public:
+  OCLCreateContext();
+  virtual ~OCLCreateContext();
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // records the device id only
+  virtual void run(void);                    // performs the whole test
+  virtual unsigned int close(void);          // returns _crcword
+};
+
+#endif  // _OCL_CreateContext_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp
new file mode 100644
index 0000000000..d6e385eaa4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp
@@ -0,0 +1,493 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLCreateImage.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sstream>
+#ifdef ATI_OS_LINUX
+#include <sys/sysinfo.h>
+#include <unistd.h>
+#endif
+
+#include "CL/cl.h"
+
+const static size_t ImageSize = 4;    // default image extent per dimension, set by the constructor
+const static size_t MaxSubTests = 5;  // copied into _numSubTests; strKernel defines five kernels
+
+const static char *strKernel =  // OpenCL C source of five kernels; each stores one image read in f4Tata[0]
+    "const sampler_t g_Sampler =    CLK_FILTER_LINEAR |                 \n"
+    "                               CLK_ADDRESS_CLAMP_TO_EDGE |         \n"
+    "                               CLK_NORMALIZED_COORDS_FALSE;        \n"
+    "                                                                   \n"
+    "__kernel void linear3D(__read_only image3d_t img3D, __global float4* "
+    "f4Tata) \n"
+    "{                                                                  \n"
+    "   float4 f4Index = { 2.25f, 1.75f, 0.5f, 0.0f };                  \n"
+    "   // copy interpolated data in result buffer                      \n"
+    "   f4Tata[0] = read_imagef(img3D, g_Sampler, f4Index);             \n"
+    "}                                                                  \n"
+    "                                                                   \n"
+    "__kernel void linear2D(__read_only image2d_t img2D, __global float4* "
+    "f4Tata) \n"
+    "{                                                                  \n"
+    "   float2 f2Index = { 2.25f, 1.75f };                              \n"
+    "   // copy interpolated data in result buffer                      \n"
+    "   f4Tata[0] = read_imagef(img2D, g_Sampler, f2Index);             \n"
+    "}                                                                  \n"
+    "                                                                   \n"
+    "__kernel void linear1DArray(__read_only image1d_array_t img1DA, __global "
+    "float4* f4Tata) \n"
+    "{                                                                  \n"
+    "   float2 f2Index = { 2.25f, 0 };                                  \n"
+    "   // copy interpolated data in result buffer                      \n"
+    "   f4Tata[0] = read_imagef(img1DA, g_Sampler, f2Index);             \n"
+    "}                                                                  \n"
+    "                                                                   \n"
+    "__kernel void linear2DArray(__read_only image2d_array_t img2DA, __global "
+    "float4* f4Tata) \n"
+    "{                                                                  \n"
+    "   float4 f4Index = { 2.25f, 1.75f, 0.0f, 0.0f };                  \n"
+    "   // copy interpolated data in result buffer                      \n"
+    "   f4Tata[0] = read_imagef(img2DA, g_Sampler, f4Index);            \n"
+    "}                                                                  \n"
+    "                                                                   \n"
+    "__kernel void point1DBuffer(__read_only image1d_buffer_t img1DB, __global "
+    "float4* f4Tata) \n"
+    "{                                                                  \n"
+    "   int index = 2;                                                  \n"
+    "   // copy interpolated data in result buffer                      \n"
+    "   f4Tata[0] = read_imagef(img1DB, index);                         \n"
+    "}                                                                  \n"
+    "                                                                   \n";
+
+OCLCreateImage::OCLCreateImage() {  // registers the sub-tests and sets the default image extents
+  _numSubTests = MaxSubTests;
+  failed_ = false;
+  ImageSizeX = ImageSize;  // open() replaces these extents per sub-test
+  ImageSizeY = ImageSize;
+  ImageSizeZ = ImageSize;
+}
+
+OCLCreateImage::~OCLCreateImage() {}  // empty: the destructor itself releases nothing
+
+void OCLCreateImage::open(unsigned int test, char *units, double &conversion,
+                          unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  cl_bool imageSupport;
+  size_t size;
+  for (size_t i = 0; i < deviceCount_; ++i) {
+    _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT,
+                              sizeof(imageSupport), &imageSupport, &size);
+    if (!imageSupport) {
+      failed_ = true;
+      return;
+    }
+  }
+
+  cl_ulong max2DWidth;
+  cl_ulong max2DHeight;
+
+  cl_ulong max3DWidth;
+  cl_ulong max3DHeight;
+  cl_ulong max3DDepth;
+
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(cl_ulong), &maxSize_, &size);
+
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                            sizeof(cl_ulong), &max2DWidth, &size);
+
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                            sizeof(cl_ulong), &max2DHeight, &size);
+
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE3D_MAX_WIDTH,
+                            sizeof(cl_ulong), &max3DWidth, &size);
+
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+                            sizeof(cl_ulong), &max3DHeight, &size);
+
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE3D_MAX_DEPTH,
+                            sizeof(cl_ulong), &max3DDepth, &size);
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  const char *kernels[] = {"linear3D", "linear2D", "linear2DArray",
+                           "linear1DArray", "point1DBuffer"};
+  unsigned int dimensions[] = {3, 2, 3, 2, 1};
+  kernel_ = _wrapper->clCreateKernel(program_, kernels[test], &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem memory;
+  cl_mem buf = NULL;
+  cl_image_desc desc;
+  size_t offset[3] = {0, 0, 0};
+  cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
+
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_array_size = 0;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+  desc.num_mip_levels = 0;
+  desc.num_samples = 0;
+  desc.buffer = (cl_mem)NULL;
+
+  if (test == 0) {
+    desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+    if (is64BitApp()) {
+      ImageSizeX = max3DWidth;
+      ImageSizeY = maxSize_ / (ImageSizeX * 16);
+      if (ImageSizeY > (max3DHeight)) {
+        ImageSizeY = max3DHeight;
+      }
+      ImageSizeZ = maxSize_ / (ImageSizeX * ImageSizeY * 16);
+    } else {
+      ImageSizeX = 4;
+      ImageSizeY = 4;
+      ImageSizeZ = 4;
+    }
+    desc.image_width = ImageSizeX;
+    desc.image_height = ImageSizeY;
+    desc.image_depth = ImageSizeZ;
+  }
+  if (test == 1) {
+    desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+    if (is64BitApp()) {
+      ImageSizeX = max2DWidth - 0x10;
+      ImageSizeY = maxSize_ / (ImageSizeX * 16 * 2);
+      if (ImageSizeY >= max2DHeight) {
+        ImageSizeY = max2DHeight - 0x1000;
+      }
+#ifdef ATI_OS_LINUX
+      // On linux, if the size of total system memory is less than 4GB,
+      // then, we can allocate much smaller image.
+      // TODO, need to find the root cause
+      struct sysinfo myinfo;
+      unsigned long total_bytes;
+
+      sysinfo(&myinfo);
+      total_bytes = myinfo.mem_unit * myinfo.totalram;
+      if ((total_bytes / (1024 * 1024)) <= 4096) {
+        ImageSizeY /= 2;
+      }
+#endif
+    } else {
+      ImageSizeX = 4;
+      ImageSizeY = 4;
+    }
+    ImageSizeZ = 0;
+    desc.image_width = ImageSizeX;
+    desc.image_height = ImageSizeY;
+    desc.image_depth = 0;
+  } else if (test == 2) {
+    desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+    ImageSizeX = ImageSize;
+    ImageSizeY = ImageSize;
+    ImageSizeZ = ImageSize;
+    desc.image_width = ImageSizeX;
+    desc.image_height = ImageSizeY;
+    desc.image_depth = 0;
+    desc.image_array_size = ImageSize;
+  } else if (test == 3) {
+    desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+    ImageSizeX = ImageSize;
+    ImageSizeY = ImageSize;
+    ImageSizeZ = 0;
+    desc.image_width = ImageSize;
+    desc.image_height = ImageSize;
+    desc.image_depth = 0;
+    desc.image_array_size = ImageSize;
+  } else if (test == 4) {
+    ImageSizeX = ImageSize;
+    desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                   ImageSizeX * 4 * sizeof(cl_float), NULL,
+                                   &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    ImageSizeY = 0;
+    ImageSizeZ = 0;
+    desc.image_width = ImageSizeX;
+    desc.image_height = 0;
+    desc.image_depth = 0;
+    desc.buffer = buf;
+  }
+
+  memory = _wrapper->clCreateImage(context_, CL_MEM_READ_ONLY, &imageFormat,
+                                   &desc, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed");
+
+  float fillColor[4] = {1.f, 1.f, 1.f, 1.f};
+
+  if (dimensions[test] == 1) {
+    float data[4][ImageSize];
+    size_t region[3] = {ImageSize, 1, 1};
+
+    error_ =
+        _wrapper->clEnqueueFillImage(cmdQueues_[_deviceId], memory, fillColor,
+                                     offset, region, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed");
+    error_ =
+        _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], memory, true,
+                                     offset, region, 0, 0, data, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+    for (size_t x = 0; x < ImageSize; ++x) {
+      if (0 != memcmp(&data[x], fillColor, sizeof(fillColor))) {
+        CHECK_RESULT(true, "Fill image validation failed");
+      }
+      data[x][0] = (float)x;
+      data[x][1] = data[x][2] = data[x][3] = 1.0f;
+    }
+    error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, true,
+                                           offset, region, 0, 0, data, 0, NULL,
+                                           NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+  } else if (dimensions[test] == 2) {
+    size_t region[3] = {ImageSizeX, ImageSizeY, 1};
+
+    error_ =
+        _wrapper->clEnqueueFillImage(cmdQueues_[_deviceId], memory, fillColor,
+                                     offset, region, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed");
+
+    float *data;
+    size_t ActualImageSizeY = ImageSizeY;
+    size_t maxImageSize = maxSize_;
+#ifdef ATI_OS_LINUX
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    if (maxImageSize > ((size_t)pages * page_size)) {
+      maxImageSize = ((size_t)pages * page_size);
+    }
+#endif
+    while ((((ImageSizeX * ActualImageSizeY * sizeof(float) * 4) /
+             (1024 * 1024)) >= (size_t)4 * 1024) ||
+           ((ImageSizeX * ActualImageSizeY * sizeof(float) * 4) >=
+            (maxImageSize / 2))) {
+      if (ActualImageSizeY == 1) {
+        break;
+      }
+      ActualImageSizeY /= 2;
+    }
+    while ((data = (float *)malloc(ImageSizeX * ActualImageSizeY *
+                                   sizeof(float) * 4)) == NULL) {
+      if (ActualImageSizeY == 1) {
+        break;
+      }
+      ActualImageSizeY /= 2;
+    }
+    if (data == NULL) {
+      CHECK_RESULT(true, "malloc() failed");
+    }
+
+    size_t remainSizeY = ImageSizeY;
+    while (remainSizeY > 0) {
+      ActualImageSizeY =
+          (remainSizeY > ActualImageSizeY) ? ActualImageSizeY : remainSizeY;
+      size_t tmpRange[3] = {ImageSizeX, ActualImageSizeY, 1};
+      error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], memory, true,
+                                            offset, tmpRange, 0, 0, data, 0,
+                                            NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+      for (size_t y = 0; y < ActualImageSizeY; ++y) {
+        for (size_t x = 0; x < ImageSizeX; ++x) {
+          size_t offsetData = (y * ImageSizeX + x) * 4;
+          if (0 != memcmp(&data[offsetData], fillColor, sizeof(fillColor))) {
+            CHECK_RESULT(true, "Fill image validation failed");
+          }
+          data[offsetData + 0] = (float)x;
+          data[offsetData + 1] = (float)y;
+          data[offsetData + 2] = data[offsetData + 3] = 1.0f;
+        }
+      }
+      error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory,
+                                             true, offset, tmpRange, 0, 0, data,
+                                             0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+      remainSizeY -= ActualImageSizeY;
+      offset[1] += ActualImageSizeY;
+    }
+    free(data);
+  } else if (dimensions[test] == 3) {
+    float *data;
+
+    float index = 0.f;
+    size_t region[3] = {ImageSizeX, ImageSizeY, ImageSizeZ};
+    error_ =
+        _wrapper->clEnqueueFillImage(cmdQueues_[_deviceId], memory, fillColor,
+                                     offset, region, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed");
+
+    size_t ActualImageSizeZ = ImageSizeZ;
+    size_t maxImageSize = maxSize_;
+#ifdef ATI_OS_LINUX
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    if (maxImageSize > ((size_t)pages * page_size)) {
+      maxImageSize = ((size_t)pages * page_size);
+    }
+#endif
+    while ((((ImageSizeX * ImageSizeY * ActualImageSizeZ * sizeof(float) * 4) /
+             (1024 * 1024)) >= (size_t)4 * 1024) ||
+           ((ImageSizeX * ImageSizeY * ActualImageSizeZ * sizeof(float) * 4) >=
+            (maxImageSize / 2))) {
+      if (ActualImageSizeZ == 1) {
+        break;
+      }
+      ActualImageSizeZ /= 2;
+    }
+    while ((data = (float *)malloc(ImageSizeX * ImageSizeY * ActualImageSizeZ *
+                                   sizeof(float) * 4)) == NULL) {
+      if (ActualImageSizeZ == 1) {
+        break;
+      }
+      ActualImageSizeZ -= 1;
+    }
+    if (data == NULL) {
+      CHECK_RESULT(true, "malloc() failed");
+    }
+
+    size_t remainSizeZ = ImageSizeZ;
+    while (remainSizeZ > 0) {
+      ActualImageSizeZ =
+          (remainSizeZ > ActualImageSizeZ) ? ActualImageSizeZ : remainSizeZ;
+      size_t tmpRange[3] = {ImageSizeX, ImageSizeY, ActualImageSizeZ};
+      error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], memory, true,
+                                            offset, tmpRange, 0, 0, data, 0,
+                                            NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+      for (size_t z = 0; z < ActualImageSizeZ; ++z) {
+        for (size_t y = 0; y < ImageSizeY; ++y) {
+          for (size_t x = 0; x < ImageSizeX; ++x) {
+            size_t offset = (((z * ImageSizeY) + y) * ImageSizeX + x) * 4;
+            if (0 != memcmp(&data[offset], fillColor, sizeof(fillColor))) {
+              CHECK_RESULT(true, "Fill image validation failed");
+            }
+            data[offset + 0] = (float)x;
+            data[offset + 1] = (float)y;
+            data[offset + 2] = (float)z;
+            data[offset + 3] = 1.0f;
+          }
+        }
+      }
+      error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory,
+                                             true, offset, tmpRange, 0, 0, data,
+                                             0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+      remainSizeZ -= ActualImageSizeZ;
+      offset[2] += ActualImageSizeZ;
+    }
+    free(data);
+  }
+
+  buffers_.push_back(memory);
+
+  memory = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                    4 * sizeof(cl_float), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(memory);
+  if (buf != NULL) {
+    buffers_.push_back(buf);
+  }
+  size_t imageSizebyte =
+      (ImageSizeY != 0) ? ImageSizeY * ImageSizeX : ImageSizeX;
+  imageSizebyte *= (ImageSizeZ != 0) ? ImageSizeZ : 1;
+  imageSizebyte *= 16;  //  16 bytes per pixel, imageFormat = {CL_RGBA,CL_FLOAT}
+  char strImgSize[200];
+  if (imageSizebyte >= 1024 * 1024) {
+    sprintf(strImgSize, "%5ld MB", (long)(imageSizebyte / (1024 * 1024)));
+  } else {
+    sprintf(strImgSize, "%6ld Bytes", (long)imageSizebyte);
+  }
+  std::stringstream str;
+  str << " (";
+  str << ImageSizeX;
+  str << ", ";
+  str << ImageSizeY;
+  str << ",  ";
+  str << ImageSizeZ;
+  str << ") ";
+  str << strImgSize;
+
+  testDescString = str.str();
+}
+
+// Empty callback: every argument is ignored and nothing is done.
+static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
+                                        const void * /*private_info*/,
+                                        size_t /*cb*/, void * /*user_data*/) {
+}
+
+// Launches the kernel with buffers()[0] as the image argument and
+// buffers()[1] as the output argument, reads four floats back from the
+// output buffer and compares the leading value(s) with the expected
+// references. Returns immediately when failed_ is set.
+void OCLCreateImage::run(void) {
+  if (failed_) {
+    return;
+  }
+
+  cl_mem image = buffers()[0];
+  cl_mem buffer = buffers()[1];
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &image);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // The kernel is launched with a single work-item.
+  size_t globalSize[1] = {1};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, globalSize, NULL, 0, NULL,
+                                            NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  // Blocking read of the four result floats.
+  cl_float values[4] = {0.f, 0.f, 0.f, 0.f};
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0,
+                                         sizeof(values), values, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+  // Subtest 4 expects a different first value; subtests 3 and above only
+  // have their first value checked.
+  cl_float ref[2] = {1.75f, 1.25f};
+  if (testID_ == 4) {
+    ref[0] = 2.0f;
+  }
+  const cl_uint checkedCount = (testID_ >= 3) ? 1u : 2u;
+  for (cl_uint idx = 0; idx < checkedCount; ++idx) {
+    if (values[idx] == ref[idx]) {
+      continue;
+    }
+    printf("%.2f != %.2f [ref]", values[idx], ref[idx]);
+    CHECK_RESULT(true, " - Incorrect result for linear filtering!\n");
+  }
+}
+
+// Delegates to OCLTestImp::close() and forwards its return value.
+unsigned int OCLCreateImage::close(void) {
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h
new file mode 100644
index 0000000000..95347a5f6c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_CREATE_IMAGE_H_
+#define _OCL_CREATE_IMAGE_H_
+
+#include "OCLTestImp.h"
+
+// Image-creation test derived from OCLTestImp; declares the open/run/close
+// entry points and the per-subtest state they share.
+class OCLCreateImage : public OCLTestImp {
+ public:
+  OCLCreateImage();
+  virtual ~OCLCreateImage();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  bool failed_;          // when set, run() returns without doing anything
+  unsigned int testID_;  // subtest index; run() picks its references from it
+  size_t maxSize_;       // size limit consulted by open()
+  size_t ImageSizeX;     // image width used by open()
+  size_t ImageSizeY;     // image height used by open()
+  size_t ImageSizeZ;     // image depth used by open()
+
+  // True when a pointer occupies 8 bytes in this build.
+  bool is64BitApp() { return sizeof(int*) == 8; }
+};
+
+#endif  // _OCL_CREATE_IMAGE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp
new file mode 100644
index 0000000000..7d5a94aedb
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp
@@ -0,0 +1,210 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDeviceAtomic.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Number of work-items used for the full-range kernel launches in run().
+static const cl_uint TotalElements = 256 * 1024 * 1024;
+// Number of cl_uint slots in the shared result buffer.
+static const cl_uint ArraySize = 256;
+// Host-side image of the result buffer: zeroed and copied to the device in
+// open(), read back from the device in run().
+static cl_uint hostArray[ArraySize];
+
+// Turns its argument tokens into a string literal so kernel source can be
+// written inline; the "\n" tokens become literal backslash-n sequences in
+// the resulting string.
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+// Kernel sources, indexed by subtest number (open() passes &strKernel[test]).
+//
+// [0] Both kernels atomically add 1 to res[0] once per work-item.
+const static char* strKernel[] = {
+    KERNEL_CODE(
+    \n __kernel void atomic_test1(__global uint* res) {
+      __global atomic_uint* inc = (__global atomic_uint*)res;
+      atomic_fetch_add_explicit(inc, 1, memory_order_acq_rel,
+                                memory_scope_device);
+    }
+    \n __kernel void atomic_test2(__global uint* res) {
+      __global atomic_uint* inc = (__global atomic_uint*)res;
+      atomic_fetch_add_explicit(inc, 1, memory_order_acq_rel,
+                                memory_scope_device);
+    }
+    \n),
+    // [1] atomic_test1 scans res[0..255] with acquire loads, up to 256 * 1024
+    //     passes, and on the first non-zero value stores its global id in
+    //     res[1] and the pass number in res[2]. atomic_test2 has the single
+    //     work-item with global id 64,000,000 store that id into res[0] with
+    //     release ordering.
+    KERNEL_CODE(
+    \n __kernel void atomic_test1(__global uint* res) {
+      for (uint i = 0; i < 256 * 1024; ++i) {
+        for (uint j = 0; j < 256; ++j) {
+          __global atomic_uint* inc = (__global atomic_uint*)&res[j];
+          uint val = atomic_load_explicit(inc, memory_order_acquire,
+                                          memory_scope_device);
+          if (0 != val) {
+            res[1] = get_global_id(0);
+            res[2] = i;
+            return;
+          }
+        }
+      }
+    }
+    \n __kernel void atomic_test2(__global uint* res) {
+      if (get_global_id(0) == 64 * 1000 * 1000) {
+        __global atomic_uint* inc = (__global atomic_uint*)res;
+        // atomic_fetch_add_explicit(inc, 1, memory_order_acq_rel,
+        // memory_scope_device);
+        atomic_store_explicit(inc, get_global_id(0), memory_order_release,
+                              memory_scope_device);
+      }
+    }
+    \n)};
+
+// Starts with no extra queue and no second kernel, not marked as failed, and
+// announces two subtests.
+OCLDeviceAtomic::OCLDeviceAtomic()
+    : hostQueue_(NULL),
+      failed_(false),
+      kernel2_(NULL) {
+  _numSubTests = 2;
+}
+
+// Nothing to release here; the destructor body is empty.
+OCLDeviceAtomic::~OCLDeviceAtomic() {
+}
+
+// Prepares one device-atomics subtest.
+//
+// test       selects the kernel pair in strKernel and is stored in testID_.
+// units,
+// conversion are only forwarded to OCLTestImp::open().
+// deviceId   selects the device the program is built for.
+//
+// The subtest is skipped (failed_ set, so run() returns immediately) when the
+// character at index 7 of the CL_DEVICE_VERSION string is below '2', or when
+// the string is too short to have that character. Otherwise the program is
+// built with -cl-std=CL2.0, both kernels are created, a zero-filled device
+// copy of hostArray is appended to buffers_ and, when CL_VERSION_2_0 is
+// defined at compile time, a second command queue is created in hostQueue_.
+void OCLDeviceAtomic::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Evaluate the version before any early exit so the buffer is released
+  // exactly once on every path, using delete[] to match new[].
+  // Index 7 is the character right after the 7-character "OpenCL " prefix.
+  const bool olderThan20 =
+      (error_ == CL_SUCCESS) && (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (olderThan20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel[test],
+                                                 NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-initialised so that a failing log query prints an empty string
+    // instead of indeterminate stack contents.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "atomic_test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "atomic_test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // The device buffer starts as a copy of the zeroed host array.
+  cl_mem buffer;
+  memset(hostArray, 0, sizeof(hostArray));
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_COPY_HOST_PTR,
+                                    sizeof(hostArray), &hostArray, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  // Second queue on the same device, used by run() for the second kernel.
+  const cl_queue_properties cprops[] = {CL_QUEUE_PROPERTIES,
+                                        static_cast<cl_queue_properties>(0), 0};
+  hostQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#endif
+}
+
+// Empty callback: every argument is ignored and nothing is done.
+static void CL_CALLBACK notify_callback(const char* /*errinfo*/,
+                                        const void* /*private_info*/,
+                                        size_t /*cb*/, void* /*user_data*/) {
+}
+
+// Runs atomic_test1 on the default queue and atomic_test2 on hostQueue_
+// against the same buffer, waits for both queues and validates the buffer.
+//
+// Subtest 0: both kernels run over TotalElements work-items and the counter
+//            in hostArray[0] must equal 2 * TotalElements.
+// Other:     atomic_test1 runs as a single work-item while atomic_test2 runs
+//            over TotalElements work-items; hostArray[0] must be non-zero.
+//
+// Returns immediately when open() set failed_.
+void OCLDeviceAtomic::run(void) {
+  if (failed_) return;
+  cl_mem buffer = buffers()[0];
+
+  // Launch size of atomic_test1 depends on the subtest; atomic_test2 always
+  // covers the full range.
+  size_t gwsFirst[1] = {(testID_ == 0) ? static_cast<size_t>(TotalElements)
+                                       : static_cast<size_t>(1)};
+  size_t gwsSecond[1] = {TotalElements};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gwsFirst, NULL, 0, NULL,
+                                            NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(hostQueue_, kernel2_, 1, NULL,
+                                            gwsSecond, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  // Flush both queues before waiting on either, so that both kernels have
+  // been submitted by the time the first clFinish() blocks.
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+  _wrapper->clFlush(hostQueue_);
+
+  // Report a failure of either wait explicitly instead of letting it surface
+  // only as a bad read-back.
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed");
+  error_ = _wrapper->clFinish(hostQueue_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed");
+
+  error_ = _wrapper->clEnqueueReadBuffer(hostQueue_, buffer, CL_TRUE, 0,
+                                         sizeof(hostArray), hostArray, 0, NULL,
+                                         NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+  // hostArray holds cl_uint values, so they are printed with %u.
+  if (testID_ == 0) {
+    if (hostArray[0] != 2 * TotalElements) {
+      printf("Counter: %u, expected: %u\n", hostArray[0], 2 * TotalElements);
+      CHECK_RESULT(true, "Incorrect result for device atomic inc!\n");
+    }
+  } else {
+    printf("Value: %u, thread: %u, iter: %u\n", hostArray[0], hostArray[1],
+           hostArray[2]);
+    if (hostArray[0] == 0) {
+      CHECK_RESULT(true, "Incorrect result for device atomic inc!\n");
+    }
+  }
+}
+
+// Releases the extra command queue and the second kernel when they exist,
+// then delegates to OCLTestImp::close() and forwards its return value.
+unsigned int OCLDeviceAtomic::close(void) {
+  if (hostQueue_ != NULL) {
+    _wrapper->clReleaseCommandQueue(hostQueue_);
+  }
+
+  if (kernel2_ != NULL) {
+    _wrapper->clReleaseKernel(kernel2_);
+  }
+
+  const unsigned int baseResult = OCLTestImp::close();
+  return baseResult;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h
new file mode 100644
index 0000000000..7bb69ef1a1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DEVICE_ATOMIC_H_
+#define _OCL_DEVICE_ATOMIC_H_
+
+#include "OCLTestImp.h"
+
+// Device-atomics test derived from OCLTestImp; declares the open/run/close
+// entry points and the extra OpenCL handles the test keeps.
+class OCLDeviceAtomic : public OCLTestImp {
+ public:
+  OCLDeviceAtomic();
+  virtual ~OCLDeviceAtomic();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  cl_command_queue hostQueue_;  // additional command queue owned by the test
+  bool failed_;                 // skip flag for the current subtest
+  cl_kernel kernel2_;           // second kernel owned by the test
+  unsigned int testID_;         // index of the current subtest
+};
+
+#endif  // _OCL_DEVICE_ATOMIC_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp
new file mode 100644
index 0000000000..b233cb41cb
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp
@@ -0,0 +1,288 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDeviceQueries.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// One row of expected per-device attribute values. The DeviceInfo table
+// fills this struct with positional initializers, so the field order below
+// must stay in sync with the column order of every table row.
+struct AMDDeviceInfo {
+  const char* targetName_;        //!< Target name
+  const char* machineTarget_;     //!< Machine target
+  cl_uint simdPerCU_;             //!< Number of SIMDs per CU
+  cl_uint simdWidth_;             //!< Number of workitems processed per SIMD
+  cl_uint simdInstructionWidth_;  //!< Number of instructions processed per SIMD
+  cl_uint memChannelBankWidth_;   //!< Memory channel bank width
+  cl_uint localMemSizePerCU_;     //!< Local memory size per CU
+  cl_uint localMemBanks_;         //!< Number of banks of local memory
+  cl_uint gfxipMajor_;            //!< GFXIP major number
+  cl_uint gfxipMinor_;            //!< GFXIP minor number
+};
+
+// 1024, used to write the local-memory sizes below as "N * Ki".
+static const cl_uint Ki = 1024;
+// Expected attribute values per device. open() looks a row up by comparing
+// the reported CL_DEVICE_NAME with targetName_ and then checks the queried
+// device attributes against that row.
+static const AMDDeviceInfo DeviceInfo[] = {
+    // Columns, in AMDDeviceInfo field order:
+    //   targetName, machineTarget, simdPerCU, simdWidth,
+    //   simdInstructionWidth, memChannelBankWidth, localMemSizePerCU,
+    //   localMemBanks, gfxipMajor, gfxipMinor
+    /* CAL_TARGET_600 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_610 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_630 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_670 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_7XX */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_770 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_710 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_730 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0},
+    /* CAL_TARGET_CYPRESS */
+    {"Cypress", "cypress", 1, 16, 5, 256, 32 * Ki, 32, 4, 0},
+    /* CAL_TARGET_JUNIPER */
+    {"Juniper", "juniper", 1, 16, 5, 256, 32 * Ki, 32, 4, 0},
+    /* CAL_TARGET_REDWOOD */
+    {"Redwood", "redwood", 1, 16, 5, 256, 32 * Ki, 16, 4, 0},
+    /* CAL_TARGET_CEDAR */ {"Cedar", "cedar", 1, 8, 5, 256, 32 * Ki, 16, 4, 0},
+    /* CAL_TARGET_SUMO */
+    {"WinterPark", "redwood", 1, 16, 5, 256, 32 * Ki, 16, 4, 0},
+    /* CAL_TARGET_SUPERSUMO*/
+    {"BeaverCreek", "redwood", 1, 16, 5, 256, 32 * Ki, 16, 4, 0},
+    /* CAL_TARGET_WRESTLER*/
+    {"Loveland", "cedar", 1, 8, 5, 256, 32 * Ki, 16, 4, 0},
+    /* CAL_TARGET_CAYMAN */
+    {"Cayman", "cayman", 1, 16, 4, 256, 32 * Ki, 32, 5, 0},
+    /* CAL_TARGET_KAUAI */ {"", "", 1, 16, 5, 256, 32 * Ki, 32, 4, 0},
+    /* CAL_TARGET_BARTS */ {"Barts", "barts", 1, 16, 5, 256, 32 * Ki, 32, 4, 0},
+    /* CAL_TARGET_TURKS */ {"Turks", "turks", 1, 16, 5, 256, 32 * Ki, 32, 4, 0},
+    /* CAL_TARGET_CAICOS */
+    {"Caicos", "caicos", 1, 16, 5, 256, 32 * Ki, 32, 4, 0},
+    /* CAL_TARGET_TAHITI */
+    {"Tahiti", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 6, 0},
+    /* CAL_TARGET_PITCAIRN */
+    {"Pitcairn", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 6, 0},
+    /* CAL_TARGET_CAPEVERDE */
+    {"Capeverde", "capeverde", 4, 16, 1, 256, 64 * Ki, 32, 6, 0},
+    /* CAL_TARGET_DEVASTATOR */
+    {"Devastator", "trinity", 1, 16, 4, 256, 32 * Ki, 32, 5, 0},
+    /* CAL_TARGET_SCRAPPER */
+    {"Scrapper", "trinity", 1, 16, 4, 256, 32 * Ki, 32, 5, 0},
+    /* CAL_TARGET_OLAND */ {"Oland", "oland", 4, 16, 1, 256, 64 * Ki, 32, 6, 0},
+    /* CAL_TARGET_BONAIRE */
+    {"Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 7, 2},
+    /* CAL_TARGET_SPECTRE */
+    {"Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 7, 1},
+    /* CAL_TARGET_SPOOKY */
+    {"Spooky", "spooky", 4, 16, 1, 256, 64 * Ki, 32, 7, 1},
+    /* CAL_TARGET_KALINDI */
+    {"Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 7, 2},
+    /* CAL_TARGET_HAINAN */
+    {"Hainan", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 6, 0},
+    /* CAL_TARGET_HAWAII */
+    {"Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 7, 2},
+    /* CAL_TARGET_ICELAND */
+    {"Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_TONGA */ {"Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_MULLINS */
+    {"Mullins", "mullins", 4, 16, 1, 256, 64 * Ki, 32, 7, 2},
+    /* CAL_TARGET_FIJI */ {"Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_CARRIZO */
+    {"Carrizo", "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_CARRIZO */
+    {"Bristol Ridge", "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_Ellesmere */
+    {"Ellesmere", "ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_BAFFIN */
+    {"Baffin", "baffin", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* ROCM Kaveri */ {"gfx700", "gfx700", 4, 16, 1, 256, 64 * Ki, 32, 7, 1},
+    /* ROCM Hawaii */ {"gfx701", "gfx701", 4, 16, 1, 256, 64 * Ki, 32, 7, 2},
+    /* ROCM Kabini */ {"gfx703", "gfx703", 4, 16, 1, 256, 64 * Ki, 32, 7, 2},
+    /* ROCM Iceland */ {"gfx800", "gfx800", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* ROCM Carrizo */ {"gfx801", "gfx801", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* ROCM Tonga */ {"gfx802", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* ROCM Fiji  */ {"gfx803", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* Vega10 */ {"gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* CAL_TARGET_STONEY */
+    {"Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* CAL_TARGET_LEXA */
+    {"gfx804", "gfx804", 4, 16, 1, 256, 64 * Ki, 32, 8, 0},
+    /* Vega10_XNACK */ {"gfx901", "gfx901", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Raven */ {"gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* ROCM Raven_XNACK */
+    {"gfx902-xnack", "gfx902-xnack", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Raven_XNACK */ {"gfx903", "gfx903", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Vega12      */ {"gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Vega12_XNACK */ {"gfx905", "gfx905", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Vega20 */ {"gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Vega20 */
+    {"gfx906+sram-ecc", "gfx906+sram-ecc", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Vega20_XNACK */ {"gfx907", "gfx907", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* MI100 */ {"gfx908", "gfx908", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* MI100 */
+    {"gfx908+sram-ecc", "gfx908+sram-ecc", 4, 16, 1, 256, 64 * Ki, 32, 9, 0},
+    /* Navi10 */ {"gfx1010", "gfx1010", 4, 32, 1, 256, 64 * Ki, 32, 10, 1},
+    /* Navi12 */ {"gfx1011", "gfx1011", 4, 32, 1, 256, 64 * Ki, 32, 10, 1},
+    /* Navi14 */ {"gfx1012", "gfx1012", 4, 32, 1, 256, 64 * Ki, 32, 10, 1},
+};
+
+// Number of rows in DeviceInfo; bounds the lookup loop in open().
+const int DeviceInfoSize = sizeof(DeviceInfo) / sizeof(AMDDeviceInfo);
+
+// Announces a single subtest and starts out not marked as failed.
+OCLDeviceQueries::OCLDeviceQueries() : failed_(false) {
+  _numSubTests = 1;
+}
+
+// Nothing to release here; the destructor body is empty.
+OCLDeviceQueries::~OCLDeviceQueries() {
+}
+
+// Validates the AMD device-attribute queries of one device against the
+// DeviceInfo table.
+//
+// test, units and conversion are only forwarded to OCLTestImp::open();
+// deviceId selects the device to query.
+//
+// The test is skipped (failed_ set, no failure reported) when deviceId is out
+// of range, the device is not a GPU, or the extension string does not mention
+// cl_amd_device_attribute_query. Otherwise the device name is looked up in
+// DeviceInfo and each queried attribute is checked with CHECK_RESULT, either
+// against the table row or for being non-zero.
+void OCLDeviceQueries::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  if (deviceId >= deviceCount_) {
+    failed_ = true;
+    return;
+  }
+  cl_uint value;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+
+  // Size the extension buffer from the reported length. With a fixed-size
+  // buffer a longer extension string makes the query fail, and ignoring that
+  // failure would silently skip the test.
+  size_t size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                                     0, NULL, &size);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_EXTENSIONS failed");
+  std::string extensions(size, '\0');
+  if (size != 0) {
+    error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                       CL_DEVICE_EXTENSIONS, size,
+                                       &extensions[0], NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_EXTENSIONS failed");
+  }
+  if (extensions.find("cl_amd_device_attribute_query") == std::string::npos) {
+    printf("AMD device attribute  extension is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+
+  char name[1024] = {0};
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_NAME,
+                                     sizeof(name), name, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_NAME failed");
+
+  // Find the table row whose target name equals the reported device name.
+  std::string str = name;
+  int id = 0;
+  bool deviceFound = false;
+  for (int i = 0; i < DeviceInfoSize; ++i) {
+    if (0 == str.compare(DeviceInfo[i].targetName_)) {
+      deviceFound = true;
+      id = i;
+      break;
+    }
+  }
+  CHECK_RESULT(deviceFound != true, "Device %s is not supported", name);
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD,
+                                     sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].simdPerCU_),
+               "CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD failed");
+
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SIMD_WIDTH_AMD,
+                                sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_SIMD_WIDTH_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].simdWidth_),
+               "CL_DEVICE_SIMD_WIDTH_AMD failed");
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD,
+                                     sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].simdInstructionWidth_),
+               "CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD failed");
+
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD,
+      sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].memChannelBankWidth_),
+               "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD failed");
+
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD,
+      sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].localMemSizePerCU_),
+               "CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD failed");
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_LOCAL_MEM_BANKS_AMD,
+                                     sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_LOCAL_MEM_BANKS_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].localMemBanks_),
+               "CL_DEVICE_LOCAL_MEM_BANKS_AMD failed");
+
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_GFXIP_MAJOR_AMD,
+                                sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_GFXIP_MAJOR_AMD failed");
+  CHECK_RESULT((value != DeviceInfo[id].gfxipMajor_),
+               "CL_DEVICE_GFXIP_MAJOR_AMD failed");
+
+  // NOTE(review): the minor version is only queried, never compared with
+  // DeviceInfo[id].gfxipMinor_ - confirm whether that is intentional.
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_GFXIP_MINOR_AMD,
+                                sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_GFXIP_MINOR_AMD failed");
+
+  // The remaining attributes have no table column; they only have to be
+  // non-zero.
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD,
+                                     sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD failed");
+  CHECK_RESULT((value == 0), "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD failed");
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_WAVEFRONT_WIDTH_AMD,
+                                     sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_WAVEFRONT_WIDTH_AMD failed");
+  CHECK_RESULT((value == 0), "CL_DEVICE_WAVEFRONT_WIDTH_AMD failed");
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD,
+                                     sizeof(cl_uint), &value, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD failed");
+  CHECK_RESULT((value == 0), "CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD failed");
+}
+
+// Empty callback: every argument is ignored and nothing is done.
+static void CL_CALLBACK notify_callback(cl_event /*event*/,
+                                        cl_int /*event_command_exec_status*/,
+                                        void* /*user_data*/) {
+}
+
+// Performs no work: the body only tests failed_ and returns either way.
+void OCLDeviceQueries::run(void) {
+  if (failed_) return;
+}
+
+// Delegates to OCLTestImp::close() and forwards its return value.
+unsigned int OCLDeviceQueries::close(void) {
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h
new file mode 100644
index 0000000000..db6896a6f7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DEVICE_QUERIES_H_
+#define _OCL_DEVICE_QUERIES_H_
+
+#include "OCLTestImp.h"
+
+class OCLDeviceQueries : public OCLTestImp {
+ public:
+  OCLDeviceQueries();
+  virtual ~OCLDeviceQueries();
+
+  // Test entry points; close() simply forwards to OCLTestImp::close().
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // checked at the top of run()
+};
+
+#endif  // _OCL_DEVICE_QUERIES_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp
new file mode 100644
index 0000000000..372919ad13
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp
@@ -0,0 +1,225 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDynamic.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+static const cl_uint TotalElements = 128;  // global work size and number of results checked in run()
+static cl_uint hostArray[TotalElements];   // initial contents copied into the CL buffer by open()
+
+#define KERNEL_CODE(...) #__VA_ARGS__  // stringifies its arguments into kernel source text
+
+const static char* strKernel[] = {  // indexed by subtest number in open()
+    KERNEL_CODE(  // [0]: child block is enqueued on get_default_queue()
+    \n void block_fn(int tid, int mul, __global uint* res) {
+      res[tid] = mul * 7 - 21;  // 0 for the multiplier of 3 used by the kernel
+    }
+
+        __kernel void dynamic(__global uint* res) {
+          int multiplier = 3;
+          int tid = get_global_id(0);
+
+          void (^kernelBlock)(void) = ^{
+            block_fn(tid, multiplier, res);
+          };
+
+          res[tid] = -1;  // placeholder until the child block stores its value
+          queue_t def_q = get_default_queue();
+          ndrange_t ndrange = ndrange_1D(1);
+          int enq_res;
+          do {  // keep retrying until the device queue accepts the child
+            enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange,
+                                     kernelBlock);
+            if (enq_res != 0 /*CL_SUCCESS*/) {
+              res[tid] = -2;
+            }
+          } while (enq_res != 0);
+        }
+    \n),
+    KERNEL_CODE(  // [1]: the device queue arrives as a kernel argument
+    \n void block_fn(int tid, int mul, __global uint* res) {
+      res[tid] = mul * 7 - 21;  // 0 for the multiplier of 3 used by the kernel
+    }
+
+        __kernel void dynamic(__global uint* res, queue_t def_q) {
+          int multiplier = 3;
+          int tid = get_global_id(0);
+
+          void (^kernelBlock)(void) = ^{
+            block_fn(tid, multiplier, res);
+          };
+
+          res[tid] = -1;  // placeholder until the child block stores its value
+          ndrange_t ndrange = ndrange_1D(1);
+          // if (tid == 0) {
+          int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL,
+                                       ndrange, kernelBlock);
+          if (enq_res != 0 /*CL_SUCCESS*/) {
+            res[tid] = -2;  // single attempt here: record the failure and stop
+            return;
+          }
+          //}
+        }
+    \n)};
+
+// Two subtests, one per strKernel entry. testID_ is initialised as well so
+// it never holds an indeterminate value before open() assigns it.
+OCLDynamic::OCLDynamic() : deviceQueue_(NULL), failed_(false), testID_(0) {
+  _numSubTests = 2;
+}
+
+OCLDynamic::~OCLDynamic() {}  // empty; the device queue is released in close()
+
+void OCLDynamic::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Index 7 is the major digit ("OpenCL <major>..."); free before any exit.
+  const bool preCL2 =
+      (error_ == CL_SUCCESS) && (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    failed_ = true;  // run() returns early when this is set
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel[test],
+                                                 NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "dynamic", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  // Result buffer, seeded from hostArray (filled with 0xee bytes).
+  memset(hostArray, 0xee, sizeof(hostArray));
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, sizeof(hostArray),
+      &hostArray, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+  cl_uint queueSize = (test == 0) ? 1 : 257 * 1024;
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op: every argument is ignored
+
+void OCLDynamic::run(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  if (failed_) return;
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {TotalElements};
+  size_t lws[1] = {16};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  if (testID_ == 1) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_command_queue),
+                                      &deviceQueue_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  }
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Map only after the kernel has finished: launching a kernel that writes a
+  // buffer while it is mapped is undefined behaviour in OpenCL.
+  cl_uint* host = reinterpret_cast<cl_uint*>(_wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], buffer, CL_TRUE, CL_MAP_READ, 0,
+      TotalElements * sizeof(cl_uint), 0, NULL, NULL, &error_));
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed");
+
+  // block_fn stores 3 * 7 - 21 == 0; report the mapped value, not hostArray.
+  bool mismatch = false;
+  for (unsigned int i = 0; i < TotalElements; ++i) {
+    if (host[i] != 0) {
+      printf("Bad value: a[%u] = %u\n", i, host[i]);
+      mismatch = true;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffer,
+                                             host, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapBuffer() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT(mismatch, "Incorrect result for dependency!\n");
+}
+
+unsigned int OCLDynamic::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+  if (NULL != deviceQueue_) {
+    _wrapper->clReleaseCommandQueue(deviceQueue_);
+    deviceQueue_ = NULL;  // a later close() must not release it a second time
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h
new file mode 100644
index 0000000000..f75a40e0cb
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DYNAMIC_H_
+#define _OCL_DYNAMIC_H_
+
+#include "OCLTestImp.h"
+
+class OCLDynamic : public OCLTestImp {
+ public:
+  OCLDynamic();
+  virtual ~OCLDynamic();
+
+  // Entry points implemented in OCLDynamic.cpp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  cl_command_queue deviceQueue_;  // on-device queue: made in open(), released in close()
+  bool failed_;                   // set by open() when the device is pre-OpenCL 2.0
+  unsigned int testID_;           // subtest index recorded by open()
+};
+
+#endif  // _OCL_DYNAMIC_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp
new file mode 100644
index 0000000000..0170ee4a84
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp
@@ -0,0 +1,357 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLDynamicBLines.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static cl_int nLines = 2048;  // number of Bezier lines, and the global work size in run()
+const static cl_int blockDim = 64;  // local work size used for the launches in run()
+#define MAX_TESSELLATION 64  // host copy of the value the kernel source also defines
+
+#define KERNEL_CODE(...) #__VA_ARGS__  // stringifies its arguments into kernel source text
+
+const static char* strKernel[] =
+{
+    KERNEL_CODE(
+    \n
+        \x23 define MAX_TESSELLATION 64
+    \n
+        struct BezierLine
+        {
+            float2 CP[3];
+            ulong vertexPos;
+            int nVertices;
+            int reserved;
+        };
+    \n
+        __kernel
+        void computeBezierLinePositions(int lidx, __global struct BezierLine* bLines,
+            int nTessPoints, __global char* buf)
+        {
+            int idx = get_global_id(0);
+            if (idx < nTessPoints) {
+                float u = (float)idx / (float)(nTessPoints-1);
+                float omu = 1.0f - u;
+                // Quadratic Bernstein weights for parameter u.
+                float B3u[3];
+
+                B3u[0] = omu * omu;
+                B3u[1] = 2.0f * u * omu;
+                B3u[2] = u * u;
+
+                float2 position = {0, 0};
+
+                for (int i = 0; i < 3; i++) {
+                    position = position + B3u[i] * bLines[lidx].CP[i];
+                }
+
+                ((__global float2*)(bLines[lidx].vertexPos))[idx] = position;
+            }
+        }
+    \n
+        __kernel
+        void computeBezierLines(__global struct BezierLine* bLines, int nLines, __global char* buf)
+        {
+            int lidx = get_global_id(0);
+
+            if (lidx < nLines) {
+                float curvature = length(bLines[lidx].CP[1] - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) /
+                    length(bLines[lidx].CP[2] - bLines[lidx].CP[0]);
+                int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION);
+                // First visit: reserve nTessPoints float2 slots via the uint counter at the start of buf.
+                if (bLines[lidx].vertexPos == 0) {
+                    bLines[lidx].nVertices = nTessPoints;
+                    uint value = atomic_add((__global volatile uint*)buf,
+                        nTessPoints * sizeof(float2));
+                    bLines[lidx].vertexPos = (ulong)(&buf[value]);
+                }
+                // Child launch on the default device queue: nVertices work-items, local size 64.
+                queue_t def_q = get_default_queue();
+                ndrange_t ndrange = ndrange_1D(bLines[lidx].nVertices, 64);
+
+                int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
+                    ^{ computeBezierLinePositions(lidx, bLines, bLines[lidx].nVertices, buf); });
+            }
+        }
+    \n
+        __kernel
+        void computeBezierLines2(__global struct BezierLine* bLines, int nLines, __global char* buf)
+        {
+            int lidx = get_global_id(0);
+
+            if (lidx < nLines) {
+                float curvature = length(bLines[lidx].CP[1] - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) /
+                    length(bLines[lidx].CP[2] - bLines[lidx].CP[0]);
+                int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION);
+                // Same reservation as computeBezierLines, but with no device-side enqueue.
+                if (bLines[lidx].vertexPos == 0) {
+                    bLines[lidx].nVertices = nTessPoints;
+                    uint value = atomic_add((__global volatile uint*)buf,
+                        nTessPoints * sizeof(float2));
+                    bLines[lidx].vertexPos = (ulong)(&buf[value]);
+                }
+            }
+        }
+    \n
+    )
+};
+
+OCLDynamicBLines::OCLDynamicBLines()
+    : deviceQueue_(NULL),
+      failed_(false),
+      testID_(0),  // previously left uninitialized until open() ran
+      bLines_(NULL),
+      hostArray_(NULL),
+      kernel2_(NULL), kernel3_(NULL) {
+  _numSubTests = 1;
+}
+
+OCLDynamicBLines::~OCLDynamicBLines() {}  // empty; arrays, queue and kernels are freed in close()
+
+void OCLDynamicBLines::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Index 7 is the major digit ("OpenCL <major>..."); free before any exit.
+  const bool preCL2 =
+      (error_ == CL_SUCCESS) && (param_size <= 7 || strVersion[7] < '2');
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    failed_ = true;  // run() returns early when this is set
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel[test],
+                                                 NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "computeBezierLines", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "computeBezierLines2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel3_ =
+      _wrapper->clCreateKernel(program_, "computeBezierLinePositions", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  bLines_ = new BezierLine[nLines];
+
+  cl_float2 last = {0, 0};
+  for (int i = 0; i < nLines; i++) {
+    bLines_[i].CP[0] = last;
+
+    for (int j = 1; j < 3; j++) {
+      bLines_[i].CP[j].s[0] = (float)rand() / (float)RAND_MAX;
+      bLines_[i].CP[j].s[1] = (float)rand() / (float)RAND_MAX;
+    }
+
+    last = bLines_[i].CP[2];
+    bLines_[i].vertexPos = 0;
+    bLines_[i].nVertices = 0;
+    bLines_[i].reserved = 0;
+  }
+
+  cl_mem buffer =
+      _wrapper->clCreateBuffer(context_, CL_MEM_USE_HOST_PTR,
+                               sizeof(BezierLine) * nLines, bLines_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+  // Slot 0 holds the allocation cursor, so the CL buffer needs the "+ 1" too.
+  hostArray_ = new cl_float2[nLines * (MAX_TESSELLATION + 1)];
+  ((unsigned int*)hostArray_)[0] = sizeof(cl_float2);
+  buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_USE_HOST_PTR,
+      sizeof(cl_float2) * nLines * (MAX_TESSELLATION + 1), hostArray_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  cl_uint queueSize = 256 * 1024;
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op: every argument is ignored
+
+void OCLDynamicBLines::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  if (failed_) return;
+
+  cl_mem buffer = buffers()[0];
+  cl_mem alloc = buffers()[1];
+
+  size_t gws[1] = {nLines};
+  size_t lws[1] = {blockDim};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_int), &nLines);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), &alloc);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  // Untimed first launch of the device-enqueue kernel.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  for (int i = 0; i < nLines; i++) {
+    bLines_[i].vertexPos = 0;
+    bLines_[i].nVertices = 0;
+    bLines_[i].reserved = 0;
+  }
+  ((unsigned int*)hostArray_)[0] = sizeof(cl_float2);
+  // Timed pass: the same kernel again, starting from the reset state.
+  timer.Reset();
+  timer.Start();
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  for (int i = 0; i < nLines; i++) {
+    bLines_[i].vertexPos = 0;
+    bLines_[i].nVertices = 0;
+    bLines_[i].reserved = 0;
+  }
+  unsigned int allocSize = ((unsigned int*)hostArray_)[0];
+  ((unsigned int*)hostArray_)[0] = sizeof(cl_float2);
+
+  // Host emulation: the same work driven from the host - one allocation
+  // kernel followed by one position-kernel launch per line. allocSize (saved
+  // above) is the byte count the device-enqueue pass allocated.
+  timer.Reset();
+  timer.Start();
+  // Step 1. Fill the jobs
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), &buffer);
+  error_ |= _wrapper->clSetKernelArg(kernel2_, 1, sizeof(cl_int), &nLines);
+  error_ |= _wrapper->clSetKernelArg(kernel2_, 2, sizeof(cl_mem), &alloc);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Step 2. Run all jobs
+  for (int lidx = 0; lidx < nLines; lidx++) {
+    // Readback the new dimension.
+    error_ = _wrapper->clSetKernelArg(kernel3_, 0, sizeof(cl_int), &lidx);
+    error_ |= _wrapper->clSetKernelArg(kernel3_, 1, sizeof(cl_mem), &buffer);
+    error_ |= _wrapper->clSetKernelArg(kernel3_, 2, sizeof(cl_int),
+                                       &bLines_[lidx].nVertices);
+    error_ |= _wrapper->clSetKernelArg(kernel3_, 3, sizeof(cl_mem), &alloc);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+    // Size the launch for this line only, rounded up to whole work-groups
+    // (never zero); the kernel itself ignores ids >= nVertices.
+    const int groups = (bLines_[lidx].nVertices + blockDim - 1) / blockDim;
+    size_t gwsL[1] = {static_cast<size_t>(groups < 1 ? 1 : groups) * blockDim};
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel3_,
+                                              1, NULL, gwsL, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+  double sec2 = timer.GetElapsedTime();
+
+  if (memcmp(&allocSize, hostArray_, sizeof(cl_uint)) != 0) {
+    CHECK_RESULT(true, "Validation failed!");
+  }
+
+  if (sec >= sec2) {
+    _perfInfo = (float)(sec2 - sec);
+    CHECK_RESULT(true, "Device enqueue is slower than emulation (sec)");
+    return;
+  }
+
+  _perfInfo = (float)(((sec2 - sec) / sec) * 100);
+  testDescString = "Device enqueue is (%%) faster";
+}
+
+unsigned int OCLDynamicBLines::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+  if (NULL != deviceQueue_) _wrapper->clReleaseCommandQueue(deviceQueue_);
+  if (NULL != kernel2_) _wrapper->clReleaseKernel(kernel2_);
+  if (NULL != kernel3_) _wrapper->clReleaseKernel(kernel3_);
+  deviceQueue_ = NULL;
+  kernel2_ = NULL;
+  kernel3_ = NULL;
+  const unsigned int result = OCLTestImp::close();
+  // bLines_ and hostArray_ back CL_MEM_USE_HOST_PTR buffers, so free them
+  // only after the base close() (presumably releasing buffers_ - confirm).
+  delete[] bLines_;
+  delete[] hostArray_;
+  bLines_ = NULL;
+  hostArray_ = NULL;
+  return result;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h
new file mode 100644
index 0000000000..bbb9386c4b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DYNAMIC_BLINES_H_
+#define _OCL_DYNAMIC_BLINES_H_
+
+#include "OCLTestImp.h"
+
+class OCLDynamicBLines : public OCLTestImp {
+ public:
+  OCLDynamicBLines();
+  virtual ~OCLDynamicBLines();
+
+  // Entry points implemented in OCLDynamicBLines.cpp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  struct BezierLine {    // host mirror of the kernel-side struct BezierLine
+    cl_float2 CP[3];     // control points
+    cl_ulong vertexPos;  // `ulong` in the kernel, which stores an address here
+    cl_int nVertices;    // `int` in the kernel; passed with sizeof(cl_int)
+    cl_int reserved;
+  };
+
+  cl_command_queue deviceQueue_;  // on-device queue created in open()
+  bool failed_;                   // set by open() on pre-OpenCL 2.0 devices
+  unsigned int testID_;           // subtest index recorded by open()
+  BezierLine* bLines_;            // host storage behind buffers()[0]
+  cl_float2* hostArray_;          // host storage behind buffers()[1]
+  cl_kernel kernel2_;             // "computeBezierLines2"
+  cl_kernel kernel3_;             // "computeBezierLinePositions"
+};
+
+#endif  // _OCL_DYNAMIC_BLINES_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp
new file mode 100644
index 0000000000..fd09049132
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp
@@ -0,0 +1,815 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGenericAddressSpace.h"
+
+#include "CL/cl.h"
+
+#define TO_LOCAL_FAIL 0x000f0
+#define TO_GLOBAL_FAIL 0x00e00
+#define TO_PRIVATE_FAIL 0x0d000
+#define WRONG_VALUE 0xc0000
+
+OCLGenericAddressSpace::OCLGenericAddressSpace() { _numSubTests = 7; }  // test0..test6
+
+OCLGenericAddressSpace::~OCLGenericAddressSpace() {}  // kernel is released in close()
+
+// Opens sub-test `test`; sets silentFailure when OpenCL C 2.0 is unavailable.
+void OCLGenericAddressSpace::open(unsigned int test, char* units,
+                                  double& conversion, unsigned int deviceId) {
+  kernel_ = 0;  // own members first: close() reads kernel_ even when a
+  silentFailure = false;  // check below makes this function return early
+  arrSize = 1000;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "error_ opening test");
+  _openTest = test;
+  program_ = 0;
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = (char*)calloc(param_size + 1, 1);
+  CHECK_RESULT(strVersion == 0, "version string allocation failed");
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+      CL_DEVICE_OPENCL_C_VERSION, param_size, strVersion, 0);
+  // "OpenCL C <major>.<minor> ...": index 9 holds the major version digit.
+  const char major = (param_size < 10) ? '\0' : strVersion[9];
+  free(strVersion);  // released before the check so a failure cannot leak it
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  silentFailure = (major < '2');
+  if (silentFailure) printf("\nOpenCL C 2.0 not supported\n");
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // no-op; nothing in this file references it
+
+// Runs the sub-test selected in open(). Nothing is executed when open() set
+// silentFailure or when _openTest does not name one of the seven sub-tests.
+void OCLGenericAddressSpace::run(void) {
+  if (silentFailure) {
+    return;
+  }
+
+  // Member-function table indexed by the sub-test number.
+  typedef void (OCLGenericAddressSpace::*SubTest)(void);
+  static const SubTest kSubTests[] = {
+      &OCLGenericAddressSpace::test0,
+      &OCLGenericAddressSpace::test1,
+      &OCLGenericAddressSpace::test2,
+      &OCLGenericAddressSpace::test3,
+      &OCLGenericAddressSpace::test4,
+      &OCLGenericAddressSpace::test5,
+      &OCLGenericAddressSpace::test6,
+  };
+  const size_t kNumSubTests = sizeof(kSubTests) / sizeof(kSubTests[0]);
+
+  const size_t index = static_cast<size_t>(_openTest);
+  if (index >= kNumSubTests) {
+    return;
+  }
+  SubTest selected = kSubTests[index];
+  (this->*selected)();
+}
+
+// Sub-test 6: one work-item dereferences a generic pointer to a __private
+// variable; results[0] must read back as 2 (addresses are dumped otherwise).
+void OCLGenericAddressSpace::test6(void) {
+  const char* kernel_str =
+      "\n\
+        __global unsigned int gint = 1; \n\
+        __kernel void test(__global ulong *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            unsigned int *ptr; \n\
+            __private unsigned int pint = tid + 2; \n\
+            if ((tid % 2) == 0) { \n\
+                ptr = &pint; \n\
+            } \n\
+            else { \n\
+                ptr = &gint; \n\
+            } \n\
+            results[0] = *ptr;\n\
+            results[1] = pint;\n\
+            results[2] = ptr;\n\
+            results[3] = to_private(ptr);\n\
+            results[4] = &pint;\n\
+        } \n";
+  const size_t global_work_size = 1;
+  const size_t arrSize = global_work_size * 5;
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_ulong), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_ulong* output_arr = (cl_ulong*)calloc(arrSize, sizeof(cl_ulong));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_ulong) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  const bool mismatch = !readFailed && (output_arr[0] != 2);
+  if (mismatch) {
+    printf(
+        "\n*ptr:0x%llx, pint:0x%llx, ptr:0x%llx, to_private(ptr):0x%llx, "
+        "&pint:0x%llx",
+        (unsigned long long)output_arr[0], (unsigned long long)output_arr[1],
+        (unsigned long long)output_arr[2], (unsigned long long)output_arr[3],
+        (unsigned long long)output_arr[4]);
+    printf("\n\n");
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT(mismatch, "Generic Address Space - test6 failed");
+}
+
+// Sub-test 5: two work-items read a __local (even tid) or __global (odd tid)
+// value through a generic pointer; expects 2 and 1, dumps addresses otherwise.
+void OCLGenericAddressSpace::test5(void) {
+  const char* kernel_str =
+      "\n\
+        __global unsigned int gint = 1; \n\
+        __kernel void test(__global ulong *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            results[tid] = 0; \n\
+            unsigned int *ptr; \n\
+            __local unsigned int lint; \n\
+            lint = 2; \n\
+            if ((tid % 2) == 0) { \n\
+                ptr = &lint; \n\
+            } \n\
+            else { \n\
+                ptr = &gint; \n\
+            } \n\
+            barrier(CLK_GLOBAL_MEM_FENCE); \n\
+            if ((tid % 2) == 0) { \n\
+                results[tid*5] = *ptr;\n\
+                results[tid*5+1] = lint;\n\
+                results[tid*5+2] = ptr;\n\
+                results[tid*5+3] = to_local(ptr);\n\
+                results[tid*5+4] = &lint;\n\
+            } \n\
+            else { \n\
+                results[tid*5] = *ptr;\n\
+                results[tid*5+1] = gint;\n\
+                results[tid*5+2] = ptr;\n\
+                results[tid*5+3] = to_global(ptr);\n\
+                results[tid*5+4] = &gint;\n\
+            } \n\
+        } \n";
+  const size_t global_work_size = 2;
+  const size_t arrSize = global_work_size * 5;
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_ulong), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_ulong* output_arr = (cl_ulong*)calloc(arrSize, sizeof(cl_ulong));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_ulong) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  int error_cnt = 0;
+  for (unsigned int i = 0; !readFailed && i < global_work_size; ++i) {
+    if (((i % 2 == 0) && (output_arr[i * 5] != 2)) ||
+        ((i % 2 == 1) && (output_arr[i * 5] != 1))) {
+      ++error_cnt;
+    }
+  }
+  if (error_cnt) {
+    printf("\nNumber of wrong results: %d/%d\n\n", error_cnt,
+           (int)global_work_size);
+    for (unsigned int i = 0; i < global_work_size; ++i) {
+      if (i % 2 == 0) {
+        printf(
+            "\n*ptr:0x%llx, lint:0x%llx, ptr:0x%llx, to_local(ptr):0x%llx, "
+            "&lint:0x%llx",
+            (unsigned long long)output_arr[i * 5],
+            (unsigned long long)output_arr[i * 5 + 1],
+            (unsigned long long)output_arr[i * 5 + 2],
+            (unsigned long long)output_arr[i * 5 + 3],
+            (unsigned long long)output_arr[i * 5 + 4]);
+      } else {
+        printf(
+            "\n*ptr:0x%llx, gint:0x%llx, ptr:0x%llx, to_global(ptr):0x%llx, "
+            "&gint:0x%llx",
+            (unsigned long long)output_arr[i * 5],
+            (unsigned long long)output_arr[i * 5 + 1],
+            (unsigned long long)output_arr[i * 5 + 2],
+            (unsigned long long)output_arr[i * 5 + 3],
+            (unsigned long long)output_arr[i * 5 + 4]);
+      }
+    }
+    printf("\n\n");
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((error_cnt != 0), "Generic Address Space - test5 failed");
+}
+
+// Sub-test 4: two work-items read a __private (even tid) or __global (odd tid)
+// value through a generic pointer; expects 2 and 1, dumps addresses otherwise.
+void OCLGenericAddressSpace::test4(void) {
+  const char* kernel_str =
+      "\n\
+        __global unsigned int gint = 1; \n\
+        __kernel void test(__global ulong *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            results[tid] = 0; \n\
+            unsigned int *ptr; \n\
+            __private unsigned int pint = 2; \n\
+            if ((tid % 2) == 0) { \n\
+                ptr = &pint; \n\
+            } \n\
+            else { \n\
+                ptr = &gint; \n\
+            } \n\
+            barrier(CLK_GLOBAL_MEM_FENCE); \n\
+            if ((tid % 2) == 0) { \n\
+                results[tid*5] = *ptr;\n\
+                results[tid*5+1] = pint;\n\
+                results[tid*5+2] = ptr;\n\
+                results[tid*5+3] = to_private(ptr);\n\
+                results[tid*5+4] = &pint;\n\
+            } \n\
+            else { \n\
+                results[tid*5] = *ptr;\n\
+                results[tid*5+1] = gint;\n\
+                results[tid*5+2] = ptr;\n\
+                results[tid*5+3] = to_global(ptr);\n\
+                results[tid*5+4] = &gint;\n\
+            } \n\
+        } \n";
+  const size_t global_work_size = 2;
+  const size_t arrSize = global_work_size * 5;
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_ulong), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_ulong* output_arr = (cl_ulong*)calloc(arrSize, sizeof(cl_ulong));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_ulong) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  int error_cnt = 0;
+  for (unsigned int i = 0; !readFailed && i < global_work_size; ++i) {
+    if (((i % 2 == 0) && (output_arr[i * 5] != 2)) ||
+        ((i % 2 == 1) && (output_arr[i * 5] != 1))) {
+      ++error_cnt;
+    }
+  }
+  if (error_cnt) {
+    printf("\nNumber of wrong results: %d/%d\n\n", error_cnt,
+           (int)global_work_size);
+    for (unsigned int i = 0; i < global_work_size; ++i) {
+      if (i % 2 == 0) {
+        printf(
+            "\n*ptr:0x%llx, pint:0x%llx, ptr:0x%llx, to_private(ptr):0x%llx, "
+            "&pint:0x%llx",
+            (unsigned long long)output_arr[i * 5],
+            (unsigned long long)output_arr[i * 5 + 1],
+            (unsigned long long)output_arr[i * 5 + 2],
+            (unsigned long long)output_arr[i * 5 + 3],
+            (unsigned long long)output_arr[i * 5 + 4]);
+      } else {
+        printf(
+            "\n*ptr:0x%llx, gint:0x%llx, ptr:0x%llx, to_global(ptr):0x%llx, "
+            "&gint:0x%llx",
+            (unsigned long long)output_arr[i * 5],
+            (unsigned long long)output_arr[i * 5 + 1],
+            (unsigned long long)output_arr[i * 5 + 2],
+            (unsigned long long)output_arr[i * 5 + 3],
+            (unsigned long long)output_arr[i * 5 + 4]);
+      }
+    }
+    printf("\n\n");
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((error_cnt != 0), "Generic Address Space - test4 failed");
+}
+
+// Sub-test 3: generic pointer to __global/__local/__private data by tid % 3.
+void OCLGenericAddressSpace::test3(void) {
+  const char* kernel_str =
+      "\n\
+        #define TO_LOCAL_FAIL   0x000f0\n\
+        #define TO_GLOBAL_FAIL  0x00e00\n\
+        #define TO_PRIVATE_FAIL 0x0d000\n\
+        #define WRONG_VALUE     0xc0000\n\
+        __global unsigned int gint = 1; \n\
+        __kernel void test(__global uint *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            results[tid] = 0; \n\
+            unsigned int *ptr; \n\
+            __local unsigned int lint; \n\
+            lint = 2; \n\
+            __private unsigned int pint = 3; \n\
+            switch (tid % 3) \n\
+            {\n\
+                case 0:\n\
+                    ptr = &gint; break; \n\
+                case 1:\n\
+                    ptr = &lint; break; \n\
+                case 2:\n\
+                    ptr = &pint; break; \n\
+            }\n\
+            barrier(CLK_GLOBAL_MEM_FENCE); \n\
+            switch (tid % 3) \n\
+            {\n\
+                case 0:\n\
+                    if(to_global(ptr) && (*ptr == 1))\n\
+                    {\n\
+                        results[tid] = *ptr;\n\
+                    }\n\
+                    else\n\
+                    {\n\
+                        if (*ptr != 1) results[tid] = WRONG_VALUE;\n\
+                        if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\
+                    }\n\
+                    break; \n\
+                case 1:\n\
+                    if(to_local(ptr) && (*ptr == 2))\n\
+                    {\n\
+                        results[tid] = *ptr;\n\
+                    }\n\
+                    else\n\
+                    {\n\
+                        if (*ptr != 2) results[tid] = WRONG_VALUE;\n\
+                        if(!to_local(ptr)) results[tid] |= TO_LOCAL_FAIL;\n\
+                    }\n\
+                    break; \n\
+                case 2:\n\
+                    if(to_private(ptr) && (*ptr == 3))\n\
+                    {\n\
+                        results[tid] = *ptr;\n\
+                    }\n\
+                    else\n\
+                    {\n\
+                        if (*ptr != 3) results[tid] = WRONG_VALUE;\n\
+                        if(!to_private(ptr)) results[tid] |= TO_PRIVATE_FAIL;\n\
+                    }\n\
+                    break; \n\
+            }\n\
+        } \n";
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = arrSize;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_uint* output_arr = (cl_uint*)calloc(arrSize, sizeof(cl_uint));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_uint) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  int error_cnt = 0;
+  int wrong_values = 0;
+  int to_local_error = 0;
+  int to_global_error = 0;
+  int to_private_error = 0;
+  for (unsigned int i = 0; !readFailed && i < arrSize; ++i) {
+    switch (i % 3) {
+      case 0:
+        error_cnt += (output_arr[i] != 1);
+        break;
+      case 1:
+        error_cnt += (output_arr[i] != 2);
+        break;
+      case 2:
+        error_cnt += (output_arr[i] != 3);
+        break;
+    }
+    if (output_arr[i] & WRONG_VALUE) ++wrong_values;
+    if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error;
+    if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error;
+    if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error;
+  }
+  if (error_cnt) {
+    printf("\nNumber of wrong results: %d/%d ", error_cnt, (int)arrSize);
+    printf(
+        "wrong values: %d to_local_error: %d, to_global_error: %d, "
+        "to_private_error: %d\n",
+        wrong_values, to_local_error, to_global_error, to_private_error);
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((error_cnt != 0), "Generic Address Space - test3 failed");
+}
+
+// Sub-test 2: even work-items use a generic pointer to __private data (2),
+// odd ones to __global data (1); to_private()/to_global() must accept it.
+void OCLGenericAddressSpace::test2(void) {
+  const char* kernel_str =
+      "\n\
+        #define TO_LOCAL_FAIL   0x000f0\n\
+        #define TO_GLOBAL_FAIL  0x00e00\n\
+        #define TO_PRIVATE_FAIL 0x0d000\n\
+        #define WRONG_VALUE     0xc0000\n\
+        __global unsigned int gint = 1; \n\
+        __kernel void test(__global uint *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            results[tid] = 0; \n\
+            unsigned int *ptr; \n\
+            __private unsigned int pint = 2; \n\
+            if ((tid % 2) == 0) { \n\
+                ptr = &pint; \n\
+            } \n\
+            else { \n\
+                ptr = &gint; \n\
+            } \n\
+            barrier(CLK_GLOBAL_MEM_FENCE); \n\
+            if ((tid % 2) == 0) { \n\
+                if (to_private(ptr) && *ptr == 2) {\n\
+                    results[tid] = *ptr;\n\
+                }\n\
+                else {\n\
+                    if (*ptr != 2) results[tid] = WRONG_VALUE;\n\
+                    if(!to_private(ptr)) results[tid] |= TO_PRIVATE_FAIL;\n\
+                }\n\
+            } \n\
+            else { \n\
+                if (to_global(ptr) && *ptr == 1) {\n\
+                    results[tid] = *ptr;\n\
+                }\n\
+                else {\n\
+                    if (*ptr != 1) results[tid] = WRONG_VALUE;\n\
+                    if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\
+                }\n\
+            } \n\
+        } \n";
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = arrSize;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_uint* output_arr = (cl_uint*)calloc(arrSize, sizeof(cl_uint));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_uint) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  int error_cnt = 0;
+  int wrong_values = 0;
+  int to_local_error = 0;
+  int to_global_error = 0;
+  int to_private_error = 0;
+  for (unsigned int i = 0; !readFailed && i < arrSize; ++i) {
+    if (((i % 2 == 0) && (output_arr[i] != 2)) ||
+        ((i % 2 == 1) && (output_arr[i] != 1))) {
+      if (output_arr[i] & WRONG_VALUE) ++wrong_values;
+      if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error;
+      if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error;
+      if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error;
+      ++error_cnt;
+    }
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  if (error_cnt) {
+    printf("\nNumber of wrong results: %d/%d ", error_cnt, (int)arrSize);
+    printf(
+        "wrong values: %d to_local_error: %d, to_global_error: %d, "
+        "to_private_error: %d\n",
+        wrong_values, to_local_error, to_global_error, to_private_error);
+  }
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((error_cnt != 0), "Generic Address Space - test2 failed");
+}
+
+// Sub-test 1: generic pointers to two __global variables (even tid -> 2,
+// odd tid -> 1); to_global() must accept the pointer in both cases.
+void OCLGenericAddressSpace::test1(void) {
+  const char* kernel_str =
+      "\n\
+        #define TO_LOCAL_FAIL   0x000f0\n\
+        #define TO_GLOBAL_FAIL  0x00e00\n\
+        #define TO_PRIVATE_FAIL 0x0d000\n\
+        #define WRONG_VALUE     0xc0000\n\
+        __global unsigned int gint1 = 1; \n\
+        __global unsigned int gint2 = 2; \n\
+        __kernel void test(__global uint *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            results[tid] = 0; \n\
+            unsigned int *ptr; \n\
+            if ((tid % 2) == 0) { \n\
+                ptr = &gint2; \n\
+            } \n\
+            else { \n\
+                ptr = &gint1; \n\
+            } \n\
+            barrier(CLK_GLOBAL_MEM_FENCE); \n\
+            if ((tid % 2) == 0) { \n\
+                if (to_global(ptr) && *ptr == 2) {\n\
+                    results[tid] = *ptr;\n\
+                }\n\
+                else {\n\
+                    if (*ptr != 2) results[tid] = WRONG_VALUE;\n\
+                    if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\
+                }\n\
+            } \n\
+            else { \n\
+                if (to_global(ptr) && *ptr == 1) {\n\
+                    results[tid] = *ptr;\n\
+                }\n\
+                else {\n\
+                    if (*ptr != 1) results[tid] = WRONG_VALUE;\n\
+                    if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\
+                }\n\
+            } \n\
+        } \n";
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = arrSize;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_uint* output_arr = (cl_uint*)calloc(arrSize, sizeof(cl_uint));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_uint) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  int error_cnt = 0;
+  int wrong_values = 0;
+  int to_local_error = 0;
+  int to_global_error = 0;
+  int to_private_error = 0;
+  for (unsigned int i = 0; !readFailed && i < arrSize; ++i) {
+    if (((i % 2 == 0) && (output_arr[i] != 2)) ||
+        ((i % 2 == 1) && (output_arr[i] != 1))) {
+      if (output_arr[i] & WRONG_VALUE) ++wrong_values;
+      if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error;
+      if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error;
+      if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error;
+      ++error_cnt;
+    }
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  if (error_cnt) {
+    printf("\nNumber of wrong results: %d/%d ", error_cnt, (int)arrSize);
+    printf(
+        "wrong values: %d to_local_error: %d, to_global_error: %d, "
+        "to_private_error: %d\n",
+        wrong_values, to_local_error, to_global_error, to_private_error);
+  }
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((error_cnt != 0), "Generic Address Space - test1 failed");
+}
+
+// Sub-test 0: even work-items use a generic pointer to __local data (2),
+// odd ones to __global data (1); to_local()/to_global() must accept it.
+void OCLGenericAddressSpace::test0(void) {
+  const char* kernel_str =
+      "\n\
+        #define TO_LOCAL_FAIL   0x000f0\n\
+        #define TO_GLOBAL_FAIL  0x00e00\n\
+        #define TO_PRIVATE_FAIL 0x0d000\n\
+        #define WRONG_VALUE     0xc0000\n\
+        __global unsigned int gint = 1; \n\
+        __kernel void test(__global uint *results) \n\
+        { \n\
+            uint tid = get_global_id(0); \n\
+            results[tid] = 0; \n\
+            unsigned int *ptr; \n\
+            __local unsigned int lint; \n\
+            lint = 2; \n\
+            if ((tid % 2) == 0) { \n\
+                ptr = &lint; \n\
+            } \n\
+            else { \n\
+                ptr = &gint; \n\
+            } \n\
+            barrier(CLK_GLOBAL_MEM_FENCE); \n\
+            if ((tid % 2) == 0) { \n\
+                if (to_local(ptr) && *ptr == 2) {\n\
+                    results[tid] = *ptr;\n\
+                }\n\
+                else {\n\
+                    if (*ptr != 2) results[tid] = WRONG_VALUE;\n\
+                    if(!to_local(ptr)) results[tid] |= TO_LOCAL_FAIL;\n\
+                }\n\
+            } \n\
+            else { \n\
+                if (to_global(ptr) && *ptr == 1) {\n\
+                    results[tid] = *ptr;\n\
+                }\n\
+                else {\n\
+                    if (*ptr != 1) results[tid] = WRONG_VALUE;\n\
+                    if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\
+                }\n\
+            } \n\
+        } \n";
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);  // registered only once creation succeeded
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays a valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(log), log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = arrSize;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // kernel done; no event needed
+  cl_uint* output_arr = (cl_uint*)calloc(arrSize, sizeof(cl_uint));
+  CHECK_RESULT((output_arr == NULL), "host result allocation failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer,
+                                         CL_TRUE, 0, sizeof(cl_uint) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  const bool readFailed = (error_ != CL_SUCCESS);
+  int error_cnt = 0;
+  int wrong_values = 0;
+  int to_local_error = 0;
+  int to_global_error = 0;
+  int to_private_error = 0;
+  for (unsigned int i = 0; !readFailed && i < arrSize; ++i) {
+    if (((i % 2 == 0) && (output_arr[i] != 2)) ||
+        ((i % 2 == 1) && (output_arr[i] != 1))) {
+      if (output_arr[i] & WRONG_VALUE) ++wrong_values;
+      if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error;
+      if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error;
+      if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error;
+      ++error_cnt;
+    }
+  }
+  free(output_arr);  // freed before the checks below, which may return early
+  if (error_cnt) {
+    printf("\nNumber of wrong results: %d/%d ", error_cnt, (int)arrSize);
+    printf(
+        "wrong values: %d to_local_error: %d, to_global_error: %d, "
+        "to_private_error: %d\n",
+        wrong_values, to_local_error, to_global_error, to_private_error);
+  }
+  CHECK_RESULT(readFailed, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((error_cnt != 0), "Generic Address Space - test0 failed");
+}
+
+unsigned int OCLGenericAddressSpace::close(void) {  // returns the base close() result
+  if (kernel_) {  // only when a sub-test created a kernel
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel_ = 0;  // cleared so a repeated close() skips the release
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h
new file mode 100644
index 0000000000..56aa104f61
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GenericAddressSpace_H_
+#define _OCL_GenericAddressSpace_H_
+
+#include "OCLTestImp.h"
+
+// Test class derived from OCLTestImp: overrides open/run/close and declares
+// seven private sub-test helpers (test0..test6).
+class OCLGenericAddressSpace : public OCLTestImp {
+ public:
+  OCLGenericAddressSpace();
+  virtual ~OCLGenericAddressSpace();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  void test0(void);
+  void test1(void);
+  void test2(void);
+  void test3(void);
+  void test4(void);
+  void test5(void);
+  void test6(void);
+  bool silentFailure;
+  cl_kernel kernel_;
+  size_t arrSize;
+};
+
+#endif  // _OCL_GenericAddressSpace_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp
new file mode 100644
index 0000000000..68c5968537
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp
@@ -0,0 +1,116 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGetQueueThreadID.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+#if !defined(ATI_OS_LINUX)
+#include "WinBase.h"
+typedef DWORD(WINAPI* GetThreadId)(__in HANDLE Thread);
+#endif
+bool badThread = false;
+
+OCLGetQueueThreadID::OCLGetQueueThreadID() {
+  _numSubTests = 1;  // a single sub-test
+  failed_ = false;   // open() sets this when the device index is out of range
+}
+
+OCLGetQueueThreadID::~OCLGetQueueThreadID() {}
+
+void OCLGetQueueThreadID::open(unsigned int test, char* units,
+                               double& conversion, unsigned int deviceId) {
+  // Base-class setup first; stop if it reported an error.
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  // An out-of-range device index sets failed_, which makes run() return
+  // immediately.
+  if (deviceId >= deviceCount_) {
+    failed_ = true;
+    return;
+  }
+
+  // One cl_uint-sized buffer; run() uses it as the target of a queued write.
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(cl_event event,
+                                        cl_int event_command_exec_status,
+                                        void* user_data) {
+  // user_data is the queue thread handle that run() registered with this
+  // callback. Compare it with the thread executing the callback and raise
+  // badThread on a mismatch.
+#if defined(ATI_OS_LINUX)
+  pthread_t queueThread = (pthread_t)user_data;
+  pthread_t callerThread = pthread_self();
+#else
+  // GetThreadId is looked up at run time; without it the check is skipped.
+  HMODULE module = GetModuleHandle("kernel32.dll");
+  GetThreadId getThreadId =
+      reinterpret_cast<GetThreadId>(GetProcAddress(module, "GetThreadId"));
+  if (NULL == getThreadId) return;
+  DWORD queueThread = getThreadId((HANDLE)user_data);
+  DWORD callerThread = GetCurrentThreadId();
+#endif
+  if (queueThread != callerThread) badThread = true;
+}
+
+void OCLGetQueueThreadID::run(void) {
+  // Checks that an event callback runs on the command queue's own thread.
+  if (failed_) {
+    return;
+  }
+  void* handle = NULL;
+  error_ = _wrapper->clGetCommandQueueInfo(cmdQueues_[_deviceId],
+                                           CL_QUEUE_THREAD_HANDLE_AMD,
+                                           sizeof(void*), &handle, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetCommandQueueInfo() failed");
+  cl_event clEvent;
+  cl_event userEvent = clCreateUserEvent(context_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateUserEvent() failed");
+
+  cl_uint initVal[2] = {5, 10};
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers()[0],
+                                          false, 0, sizeof(cl_uint),
+                                          &initVal[0], 1, &userEvent, &clEvent);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  error_ = _wrapper->clSetEventCallback(clEvent, CL_SUBMITTED, notify_callback,
+                                        handle);
+  // Release the gate even if registration failed so the queued write drains;
+  // error_ is only inspected after the events have been cleaned up.
+  clSetUserEventStatus(userEvent, CL_COMPLETE);
+  clFinish(cmdQueues_[_deviceId]);
+  clReleaseEvent(clEvent);
+  clReleaseEvent(userEvent);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetEventCallback() failed");
+  CHECK_RESULT(badThread, "Thread ID is incorrect!");
+}
+
+unsigned int OCLGetQueueThreadID::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h
new file mode 100644
index 0000000000..56a373218f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GET_QUEUE_THREAD_ID_H_
+#define _OCL_GET_QUEUE_THREAD_ID_H_
+
+#include "OCLTestImp.h"
+
+// Test class derived from OCLTestImp; overrides open/run/close and keeps a
+// single private flag.
+class OCLGetQueueThreadID : public OCLTestImp {
+ public:
+  OCLGetQueueThreadID();
+  virtual ~OCLGetQueueThreadID();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // when set, run() returns without doing any work
+};
+
+#endif  // _OCL_GET_QUEUE_THREAD_ID_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp
new file mode 100644
index 0000000000..efcf482e87
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLGlobalOffset.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static cl_uint ThreadsForCheck = 2;  // result slots verified by run()
+const static cl_uint GlobalOffset = 64;    // global work offset for the launch
+
+const static char* strKernel =
+    "__kernel void global_offset_test(                                      \n"
+    "   global uint* out_val)                                               \n"
+    "{                                                                      \n"
+    "   // Check the first thread                                           \n"
+    "   if (get_global_id(0) == get_global_offset(0)) {                     \n"
+    "       out_val[0] = (uint)get_global_offset(0);                        \n"
+    "   }                                                                   \n"
+    "   // Check the last thread                                            \n"
+    "   if (get_global_id(0) == (get_global_size(0) + get_global_offset(0) - "
+    "1)) {  \n"
+    "       out_val[1] = (uint)get_global_offset(0);                        \n"
+    "   }                                                                   \n"
+    "}                                                                      \n";
+
+OCLGlobalOffset::OCLGlobalOffset() { _numSubTests = 1; }
+
+OCLGlobalOffset::~OCLGlobalOffset() {}
+
+void OCLGlobalOffset::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  // Skip setup on OpenCL 1.0 devices; run() applies the same check. Query the
+  // device under test (not device 0) so open() and run() always agree.
+  char dbuffer[1024] = {0};
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_VERSION, 1024,
+                            dbuffer, NULL);
+  if (strstr(dbuffer, "OpenCL 1.0")) return;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // valid string even if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "global_offset_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // Result buffer: one cl_uint per checked work-item.
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, ThreadsForCheck * sizeof(cl_uint), NULL,
+      &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // empty; unused here
+
+void OCLGlobalOffset::run(void) {
+  // Launches the kernel with a non-zero global offset and checks that the
+  // first and last work-items both reported that offset.
+  char dbuffer[1024] = {0};
+  _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 1024,
+                            dbuffer, NULL);
+  if (strstr(dbuffer, "OpenCL 1.0")) return;  // OpenCL 1.0 devices are skipped
+  cl_uint offsetValues[ThreadsForCheck] = {0xffffffff, 0xffffffff};
+  cl_mem buffer = buffers()[0];
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, true,
+                                          0, ThreadsForCheck * sizeof(cl_uint),
+                                          offsetValues, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t gws[1] = {0x0800000};
+  size_t gwo[1] = {GlobalOffset};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            gwo, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0,
+                                         ThreadsForCheck * sizeof(cl_uint),
+                                         offsetValues, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  for (cl_uint i = 0; i < ThreadsForCheck; ++i) {
+    if (offsetValues[i] != GlobalOffset) {
+      printf("%u != %u", GlobalOffset, offsetValues[i]);
+      CHECK_RESULT(true, " - Incorrect result for global offset!\n");
+    }
+  }
+}
+
+unsigned int OCLGlobalOffset::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h
new file mode 100644
index 0000000000..0363e514a4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GLOBAL_OFFSET_H_
+#define _OCL_GLOBAL_OFFSET_H_
+
+#include "OCLTestImp.h"
+
+// Test class derived from OCLTestImp; it only overrides open/run/close and
+// adds no data members of its own.
+class OCLGlobalOffset : public OCLTestImp {
+ public:
+  OCLGlobalOffset();
+  virtual ~OCLGlobalOffset();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_GLOBAL_OFFSET_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp
new file mode 100644
index 0000000000..afeb0a49f0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp
@@ -0,0 +1,389 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLImage2DFromBuffer.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define GROUP_SIZE 256
+const unsigned int OCLImage2DFromBuffer::imageWidth = 1920;
+const unsigned int OCLImage2DFromBuffer::imageHeight = 1080;
+
+const static char strKernel[] =
+    "__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | "
+    "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n"
+    "__kernel void image2imageCopy(                                            "
+    "                             \n"
+    "    __read_only image2d_t input,                                          "
+    "                              \n"
+    "    __write_only image2d_t output)                                        "
+    "                              \n"
+    "{                                                                         "
+    "                             \n"
+    "    int2 coord = (int2)(get_global_id(0), get_global_id(1));              "
+    "                              \n"
+    "    uint4 temp = read_imageui(input, imageSampler, coord);                "
+    "                              \n"
+    "    write_imageui(output, coord, temp);                                   "
+    "                              \n"
+    "}                                                                         "
+    "                             \n";
+
+typedef CL_API_ENTRY cl_mem(CL_API_CALL *clConvertImageAMD_fn)(
+    cl_context context, cl_mem image, const cl_image_format *image_format,
+    cl_int *errcode_ret);
+
+clConvertImageAMD_fn clConvertImageAMD;
+
+OCLImage2DFromBuffer::OCLImage2DFromBuffer() : OCLTestImp() {
+  _numSubTests = 6;         // six sub-tests
+  blockSizeX = GROUP_SIZE;  // may be reduced in CompileKernel()
+  blockSizeY = 1;
+}
+
+OCLImage2DFromBuffer::~OCLImage2DFromBuffer() {}
+
+void OCLImage2DFromBuffer::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  buffer = clImage2DOriginal = clImage2D = clImage2DOut = NULL;
+  done = false;
+  pitchAlignment = 0;
+  // Remember the sub-test index; run() and AllocateOpenCLImage() branch on it.
+  _openTest = test;
+  // Initialize random number seed
+  srand((unsigned int)time(NULL));
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  // Non-GPU devices: record a description and let run() return early via done.
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    testDescString = "GPU device is required for this test!\n";
+    done = true;
+    return;
+  }
+  // Sub-tests 4 and above need the clConvertImageAMD extension entry point.
+  if (_openTest >= 4) {
+    clConvertImageAMD =
+        (clConvertImageAMD_fn)clGetExtensionFunctionAddressForPlatform(
+            platform_, "clConvertImageAMD");
+    if (clConvertImageAMD == NULL) {
+      testDescString = "clConvertImageAMD not found!\n";
+      done = true;
+      return;
+    }
+  }
+
+  CompileKernel();
+  AllocateOpenCLImage();
+}
+
+void OCLImage2DFromBuffer::run(void) {
+  if (_errorFlag || done) {  // setup failed or the sub-test was skipped
+    return;
+  }
+
+  if ((_openTest % 2) == 0) {  // even sub-tests: read the image back directly
+    testReadImage(clImage2D);
+  } else {  // odd sub-tests: copy through the kernel, then read back
+    testKernel();
+  }
+}
+
+void OCLImage2DFromBuffer::AllocateOpenCLImage() {
+  // Creates the backing buffer, the 2D image layered on it (clImage2D) and a
+  // stand-alone destination image (clImage2DOut) for the kernel copy path.
+  const bool pitchTest = (_openTest == 2 || _openTest == 3);
+  cl_int status = 0;
+  pitchAlignment = 0;
+  status = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
+                                     sizeof(cl_uint), &pitchAlignment, NULL);
+
+  if (pitchAlignment != 0) {
+    pitchAlignment--;
+  }
+
+  const unsigned int requiredPitch =
+      ((imageWidth + pitchAlignment) & ~pitchAlignment);
+  const unsigned int pitch = (!pitchTest) ? requiredPitch : imageWidth;
+  const size_t bufferSize = pitch * imageHeight;
+  CHECK_RESULT(bufferSize == 0, "ERROR: calculated image size is zero");
+
+  unsigned char *sourceData = new unsigned char[bufferSize]();
+
+  // init data
+  for (unsigned int y = 0; y < imageHeight; y++) {
+    for (unsigned int x = 0; x < imageWidth / 4; x++) {
+      for (unsigned int p = 0; p < 4; p++) {
+        *(sourceData + y * pitch + x * 4 + p) = p;
+      }
+    }
+  }
+  buffer = _wrapper->clCreateBuffer(context_,
+                                    CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
+                                    bufferSize, sourceData, &status);
+  delete[] sourceData;  // CL_MEM_COPY_HOST_PTR already copied the data
+  CHECK_RESULT(status != CL_SUCCESS, "clCreateBuffer() failed");
+  {
+    // testing clConvertImageAMD
+    if (_openTest == 4 || _openTest == 5) {
+      const cl_image_format format = {CL_R, CL_UNSIGNED_INT8};
+#if defined(CL_VERSION_2_0)
+      const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D,
+                                  imageWidth,
+                                  imageHeight,
+                                  0,
+                                  0,
+                                  pitch,
+                                  0,
+                                  0,
+                                  0,
+                                  {buffer}};
+#else
+      const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D,
+                                  imageWidth,
+                                  imageHeight,
+                                  0,
+                                  0,
+                                  pitch,
+                                  0,
+                                  0,
+                                  0,
+                                  buffer};
+#endif
+      clImage2DOriginal = _wrapper->clCreateImage(
+          context_, CL_MEM_READ_WRITE, &format, &desc, NULL, &status);
+      CHECK_RESULT(status != CL_SUCCESS, "clCreateImage() failed");
+
+      const cl_image_format formatRGBA = {CL_RGBA, CL_UNSIGNED_INT8};
+
+      clImage2D =
+          clConvertImageAMD(context_, clImage2DOriginal, &formatRGBA, &status);
+      CHECK_RESULT(status != CL_SUCCESS, "clConvertImageAMD() failed");
+
+      cl_mem fishyBuffer = 0;
+      status = clGetImageInfo(clImage2D, CL_IMAGE_BUFFER, sizeof(fishyBuffer),
+                              &fishyBuffer, 0);
+      CHECK_RESULT(status != CL_SUCCESS,
+                   "clGetImageInfo(CL_IMAGE_BUFFER) failed");
+      CHECK_RESULT(fishyBuffer != buffer,
+                   "clGetImageInfo() failed, buffer != fishyBuffer");
+    } else {
+      const cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8};
+#if defined(CL_VERSION_2_0)
+      const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D,
+                                  imageWidth / 4,
+                                  imageHeight,
+                                  0,
+                                  0,
+                                  pitch,
+                                  0,
+                                  0,
+                                  0,
+                                  {buffer}};
+#else
+      const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D,
+                                  imageWidth / 4,
+                                  imageHeight,
+                                  0,
+                                  0,
+                                  pitch,
+                                  0,
+                                  0,
+                                  0,
+                                  buffer};
+#endif
+
+      clImage2D = _wrapper->clCreateImage(context_, CL_MEM_READ_WRITE, &format,
+                                          &desc, NULL, &status);
+    }
+
+    // testing pitch alignment correct check in the runtime
+    if (pitchTest) {
+      CHECK_RESULT(requiredPitch != pitch &&
+                       (clImage2D != NULL ||
+                        status != CL_INVALID_IMAGE_FORMAT_DESCRIPTOR),
+                   "AllocateOpenCLImage() failed: (clImage2D!=NULL || "
+                   "status!=CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) <=> (%p, %x)",
+                   clImage2D, status);
+      if (requiredPitch != pitch) {
+        done = true;
+        return;
+      }
+    }
+  }
+
+  // Destination image for the kernel copy path; it has no backing buffer.
+  {
+    const cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8};
+#if defined(CL_VERSION_2_0)
+    const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D,
+                                imageWidth / 4,
+                                imageHeight,
+                                0,
+                                0,
+                                0,
+                                0,
+                                0,
+                                0,
+                                {NULL}};
+#else
+    const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D,
+                                imageWidth / 4,
+                                imageHeight,
+                                0,
+                                0,
+                                0,
+                                0,
+                                0,
+                                0,
+                                NULL};
+#endif
+    clImage2DOut = _wrapper->clCreateImage(context_, CL_MEM_READ_WRITE, &format,
+                                           &desc, NULL, &status);
+  }
+  CHECK_RESULT(clImage2D == NULL, "AllocateOpenCLImage() failed");
+}
+
+void OCLImage2DFromBuffer::testReadImage(cl_mem image) {
+  // Reads the image back (blocking) and verifies that every RGBA texel holds
+  // the byte pattern 0,1,2,3.
+  size_t bufferSize = imageWidth * imageHeight;
+  unsigned char *dstData = new unsigned char[bufferSize];
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {imageWidth / 4, imageHeight, 1};
+
+  cl_int status = clEnqueueReadImage(cmdQueues_[_deviceId], image, 1, origin,
+                                     region, 0, 0, dstData, 0, 0, 0);
+  ::clFinish(cmdQueues_[_deviceId]);
+  // The *_NO_RETURN variant is used so that control always reaches the
+  // delete[] below (assumes it records the failure like CHECK_RESULT does).
+  CHECK_RESULT_NO_RETURN(status != CL_SUCCESS, "clEnqueueReadImage() failed");
+  for (unsigned int y = 0; status == CL_SUCCESS && y < imageHeight; y++) {
+    for (unsigned int x = 0; x < imageWidth / 4; x++) {
+      for (unsigned int p = 0; p < 4; p++) {
+        if (*(dstData + y * imageWidth + x * 4 + p) != p) {
+          CHECK_RESULT_NO_RETURN(
+              true,
+              "CheckCLImage: *(dstData+y*imageWidth+x*4+p)!=p => %i != %i",
+              *(dstData + y * imageWidth + x * 4 + p), p);
+          goto cleanup;
+        }
+      }
+    }
+  }
+cleanup:
+  delete[] dstData;
+}
+
+void OCLImage2DFromBuffer::testKernel() {
+  CopyOpenCLImage(clImage2D);
+  // CopyOpenCLImage() binds clImage2DOut as the output; verify that image.
+  testReadImage(clImage2DOut);
+}
+
+unsigned int OCLImage2DFromBuffer::close(void) {  // free test mem objects
+  if (clImage2DOriginal != NULL) clReleaseMemObject(clImage2DOriginal);
+  if (clImage2D != NULL) clReleaseMemObject(clImage2D);
+  if (clImage2DOut != NULL) clReleaseMemObject(clImage2DOut);
+  if (buffer != NULL) clReleaseMemObject(buffer);  // after the images
+  return OCLTestImp::close();  // base-class close() supplies the result
+}
+
+void OCLImage2DFromBuffer::CopyOpenCLImage(cl_mem clImageSrc) {
+  // Runs the image2imageCopy kernel from clImageSrc into clImage2DOut and
+  // waits for the queue to drain.
+  cl_int status = 0;
+
+  // Bind the source image (arg 0) and the destination image (arg 1).
+  status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImageSrc);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at "
+               "clSetKernelArg(kernel_,0,sizeof(cl_mem),&clImageSrc)");
+  status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clImage2DOut);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at "
+               "clSetKernelArg(kernel_,1,sizeof(cl_mem),&clImage2DOut)");
+
+  // One work-item per RGBA texel; no global offset, and the work-group size
+  // is left to the runtime (NULL local size).
+  size_t globalThreads[] = {imageWidth / 4, imageHeight};
+
+  // Enqueue a kernel run call.
+  status = clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL,
+                                  globalThreads, NULL, 0, NULL, NULL);
+  CHECK_RESULT((status != CL_SUCCESS),
+               "CopyOpenCLImage() failed at clEnqueueNDRangeKernel");
+
+  status = clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLImage() failed at clFinish");
+}
+
+void OCLImage2DFromBuffer::CompileKernel() {
+  // Builds strKernel for the device under test and creates kernel_; every
+  // failing step is reported through CHECK_RESULT.
+  cl_int status = 0;
+
+  // Pass the source length without the array's terminating NUL.
+  size_t kernelSize = sizeof(strKernel) - 1;
+  const char *strs = (const char *)&strKernel[0];
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs,
+                                                 &kernelSize, &status);
+  CHECK_RESULT(status != CL_SUCCESS, "clCreateProgramWithSource() failed");
+
+  status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (status == CL_BUILD_PROGRAM_FAILURE) {
+    // Two-step query: first the log size, then the log text itself.
+    size_t buildLogSize = 0;
+    clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &buildLogSize);
+    std::string buildLog(buildLogSize, '\0');
+    clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                          buildLogSize, &buildLog[0], NULL);
+    printf("%s", buildLog.c_str());
+  }
+  CHECK_RESULT(status != CL_SUCCESS, "clBuildProgram() failed");
+
+  // get a kernel object handle for a kernel with the given name
+  kernel_ = _wrapper->clCreateKernel(program_, "image2imageCopy", &status);
+  CHECK_RESULT(status != CL_SUCCESS, "clCreateKernel() failed");
+
+  size_t kernel2DWorkGroupSize = 0;
+  status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId],
+                                    CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
+                                    &kernel2DWorkGroupSize, 0);
+  CHECK_RESULT(status != CL_SUCCESS, "clGetKernelWorkGroupInfo() failed");
+  // Shrink the preferred work-group shape to what this kernel supports.
+  if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize &&
+      blockSizeX > kernel2DWorkGroupSize) {
+    blockSizeX = kernel2DWorkGroupSize;
+    blockSizeY = 1;
+  }
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h
new file mode 100644
index 0000000000..0c59b216d7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCLImage2DFromBuffer_H_
+#define _OCLImage2DFromBuffer_H_
+
+#include "OCLTestImp.h"
+
+class OCLImage2DFromBuffer : public OCLTestImp {
+ public:
+  OCLImage2DFromBuffer();
+  virtual ~OCLImage2DFromBuffer();
+  // Test entry points: open() / run() / close().
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ protected:
+  static const unsigned int imageWidth;
+  static const unsigned int imageHeight;
+  // Internal helper steps of the test.
+  void testReadImage(cl_mem image);
+  void testKernel();
+  void AllocateOpenCLImage();
+  void CopyOpenCLImage(cl_mem clImageSrc);
+  void CompileKernel();
+  // Per-test state.
+  bool done;
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  cl_mem buffer;
+  cl_mem clImage2DOriginal;
+  cl_mem clImage2D;
+  cl_mem clImage2DOut;
+  cl_uint pitchAlignment;
+};
+
+#endif  // _OCLImage2DFromBuffer_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp
new file mode 100644
index 0000000000..534c58ec6a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp
@@ -0,0 +1,347 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLImageCopyPartial.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 2  // region variants: last 16 columns / last 16 rows
+static const unsigned int Sizes0[NUM_SIZES] = {16384, 16384};
+// Per-format tables indexed by bufnum_: CL format, label, bytes per texel.
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {{CL_R, CL_UNSIGNED_INT16}};
+static const char *textFormats[NUM_FORMATS] = {"R8"};
+static const unsigned int formatSize[NUM_FORMATS] = {2 * sizeof(cl_uchar)};
+// NOTE(review): the "R8" label above disagrees with CL_UNSIGNED_INT16.
+static const unsigned int Iterations[2] = {1, OCLImageCopyPartial::NUM_ITER};
+
+#define NUM_SUBTESTS 3  // src/dst kinds: img->buf, buf->img, img->img
+OCLImageCopyPartial::OCLImageCopyPartial() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS * 2;  // x Iterations[]
+}
+
+OCLImageCopyPartial::~OCLImageCopyPartial() {}
+// Context error callback passed to clCreateContext(); deliberately a no-op.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLImageCopyPartial::setData(void *ptr, unsigned int pitch,
+                                  unsigned int size, unsigned int value) {
+  // Writes the ramp 0, 1, 2, ... into size / 4 words; pitch/value are unused.
+  unsigned int *words = (unsigned int *)ptr;
+  const unsigned int wordCount = size >> 2;
+  for (unsigned int idx = 0; idx < wordCount; ++idx) {
+    words[idx] = idx;
+  }
+}
+
+void OCLImageCopyPartial::checkData(void *ptr, unsigned int pitch,
+                                    unsigned int size, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  value = 0;  // expect the ramp written by setData(); pitch/value are ignored
+  for (unsigned int i = 0; i < size >> 2; i++) {
+    if (ptr2[i] != value) {
+      // Only ptr2[i] is printed: ptr2[i + 1..3] may lie past the mapping.
+      printf("Data validation failed at %u!  Got 0x%08x\n", i, ptr2[i]);
+      printf("Expected 0x%08x\n", value);
+      CHECK_RESULT(true, "Data validation failed!");
+      break;
+    }
+    value++;
+  }
+}
+
+// Prepares one subtest: selects the platform and device, creates the context
+// and command queue, then creates and initializes the source and destination
+// objects.
+//   test       - subtest index; selects the copied strip, which of src/dst is
+//                an image (the other is a buffer) and the iteration count
+//   units      - not referenced by this test
+//   conversion - set to 1.0
+//   deviceId   - index into the device list of the selected platform
+void OCLImageCopyPartial::open(unsigned int test, char *units,
+                               double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  size_t queryOut = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+  srcImage_ = false;
+  dstImage_ = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice,
+                                      0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is intentionally not checked here.
+    delete[] platforms;
+  }
+
+  // Decode the subtest index: format, src/dst object kinds, iteration count.
+  bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % NUM_FORMATS;
+
+  if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 1) {
+    srcImage_ = true;
+  }
+  if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 2) {
+    dstImage_ = true;
+  }
+
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)];
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+  /* Get the requested device; the id array is released before any return */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices)
+    device = devices[_deviceId];
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(device == NULL, "Requested deviceID not available");
+
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                                     sizeof(size_t), &queryOut, NULL);
+  bufSizeW_ = (cl_uint)queryOut;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                                     sizeof(size_t), &queryOut, NULL);
+  bufSizeH_ = (cl_uint)queryOut;
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  void *mem;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+
+  // Only a 16-texel strip at the far edge of the image is exercised: the last
+  // 16 columns for odd subtests, the last 16 rows for even ones.
+  if (_openTest % NUM_SIZES) {
+    origin[0] = bufSizeW_ - 16;
+    region[0] = 16;
+  } else {
+    origin[1] = bufSizeH_ - 16;
+    region[1] = 16;
+  }
+
+  // NOTE(review): memSize = row pitch * rows assumes the mapped strip is
+  // densely pitched; confirm for the 16-column region.
+  if (dstImage_) {
+    dstBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, NULL, &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * (unsigned int)region[1];
+  } else {
+    dstBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, region[0] * region[1] * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        region[0] * region[1] * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize =
+        (unsigned int)region[0] * (unsigned int)region[1] * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  // Fill the destination with a 0xdeadbeef marker before the copy.
+  unsigned int *ptr2 = (unsigned int *)mem;
+  for (unsigned int i = 0; i < memSize >> 2; i++) {
+    ptr2[i] = 0xdeadbeef;
+  }
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);
+
+  // Source object: filled by setData() with the ramp that run() validates.
+  flags = CL_MEM_READ_ONLY;
+  if (srcImage_) {
+    srcBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, NULL, &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * (unsigned int)region[1];
+  } else {
+    srcBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, region[0] * region[1] * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        region[0] * region[1] * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize =
+        (unsigned int)region[0] * (unsigned int)region[1] * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize, 0xdeadbeef);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL);
+}
+
+// Runs the copy for the current subtest, validates the destination contents
+// and records the test description string.
+void OCLImageCopyPartial::run(void) {
+  // Re-derive the strip chosen in open(): odd subtests use the last 16
+  // columns, even subtests the last 16 rows.
+  const bool lastColumns = (_openTest % NUM_SIZES) != 0;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+  if (lastColumns) {
+    origin[0] = bufSizeW_ - 16;
+    region[0] = 16;
+  } else {
+    origin[1] = bufSizeH_ - 16;
+    region[1] = 16;
+  }
+
+  // Issue the copy that matches the src/dst object kinds.
+  if (srcImage_ && dstImage_) {
+    error_ =
+        _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin,
+                                     origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+  } else if (srcImage_) {
+    error_ = _wrapper->clEnqueueCopyImageToBuffer(
+        cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+  } else {
+    error_ = _wrapper->clEnqueueCopyBufferToImage(
+        cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  // Map the destination for reading and validate what the copy produced.
+  const size_t bufferBytes = region[0] * region[1] * formatSize[bufnum_];
+  void *mapped;
+  size_t rowPitch;
+  size_t slicePitch;
+  unsigned int byteCount;
+  if (!dstImage_) {
+    mapped = _wrapper->clEnqueueMapBuffer(cmd_queue_, dstBuffer_, CL_TRUE,
+                                          CL_MAP_READ, 0, bufferBytes, 0, NULL,
+                                          NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    byteCount = (unsigned int)bufferBytes;
+    rowPitch = 0;
+  } else {
+    mapped = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region, &rowPitch,
+        &slicePitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    byteCount = (unsigned int)rowPitch * (unsigned int)region[1];
+  }
+  checkData(mapped, (unsigned int)rowPitch, byteCount, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mapped, 0, NULL,
+                                    NULL);
+
+  // Describe the subtest for the harness.
+  const char *strSrc = srcImage_ ? "img" : "buf";
+  const char *strDst = dstImage_ ? "img" : "buf";
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ",
+           bufSizeW_, bufSizeH_, textFormats[bufnum_], strSrc, strDst, numIter);
+  testDescString = buf;
+}
+
+unsigned int OCLImageCopyPartial::close(void) {
+  // cmd_queue_ is still 0 if open() stopped early, so only flush a real queue.
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);
+  if (srcBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h
new file mode 100644
index 0000000000..fbb89f06a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageCopyPartial_H_
+#define _OCL_ImageCopyPartial_H_
+
+#include "OCLTestImp.h"
+// Test that copies a 16-texel edge strip between 2D images and buffers.
+class OCLImageCopyPartial : public OCLTestImp {
+ public:
+  OCLImageCopyPartial();
+  virtual ~OCLImageCopyPartial();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  // Second entry of the Iterations[] table in OCLImageCopyPartial.cpp.
+  static const unsigned int NUM_ITER = 1;
+  // OpenCL handles created in open()/released in close(); last CL status.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;
+  cl_mem dstBuffer_;
+  cl_int error_;
+  // Image size, format index, src/dst object kinds and iteration count.
+  unsigned int bufSizeW_;
+  unsigned int bufSizeH_;
+  unsigned int bufnum_;
+  bool srcImage_;
+  bool dstImage_;
+  unsigned int numIter;
+  void setData(void* ptr, unsigned int pitch, unsigned int size,
+               unsigned int value);
+  void checkData(void* ptr, unsigned int pitch, unsigned int size,
+                 unsigned int value);
+};
+
+#endif  // _OCL_ImageCopyPartial_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp
new file mode 100644
index 0000000000..5ae9932a1e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp
@@ -0,0 +1,252 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLKernelBinary.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static char* strKernel12 =  // OpenCL C source used by subtest 0
+    "typedef struct ST {                                \n"
+    "  int i0;                                          \n"
+    "  int i1;                                          \n"
+    "} ST_t;                                            \n"
+    "                                                   \n"
+    "__constant ST_t STCArray[2] = {                    \n"
+    "  { 1, 0 },                                        \n"
+    "  { 2, 1 }                                         \n"
+    "};                                                 \n"
+    "                                                   \n"
+    "__kernel void foo (__global int *p, int n)         \n"
+    "{                                                  \n"
+    "  int s = 0;                                       \n"
+    "  int i;                                           \n"
+    "  for (i=0; i < n; ++i) {                          \n"
+    "    s += STCArray[i].i0 + STCArray[i].i1;          \n"
+    "  }                                                \n"
+    "  *p = s;                                          \n"
+    "}                                                  \n";
+
+const static char* strKernel20 =  // subtest 1; built with -cl-std=CL2.0
+    "typedef struct ST {                                \n"
+    "  int i0;                                          \n"
+    "  int i1;                                          \n"
+    "} ST_t;                                            \n"
+    "                                                   \n"
+    "__constant ST_t STCArray[2] = {                    \n"
+    "  { -1, 0 },                                       \n"
+    "  { 3, -1 }                                        \n"
+    "};                                                 \n"
+    "                                                   \n"
+    "__global int var = 1;                              \n"
+    "                                                   \n"
+    "__kernel void foo (__global int *p, int n)         \n"
+    "{                                                  \n"
+    "  int s = 0;                                       \n"
+    "  int i;                                           \n"
+    "  for (i=0; i < n; ++i) {                          \n"
+    "    s += STCArray[i].i0 + STCArray[i].i1 + var++;  \n"
+    "  }                                                \n"
+    "  p[get_global_id(0)] = s;                         \n"
+    "}                                                  \n";
+
+OCLKernelBinary::OCLKernelBinary() { _numSubTests = 2; }  // CL1.2, CL2.0
+
+OCLKernelBinary::~OCLKernelBinary() {}
+
+// Builds the subtest's kernel from source, reads the device binary back,
+// recreates the program from that binary and builds it twice with the -O0
+// options, leaving kernel_ ("foo") and one 2-element buffer ready for run().
+// Subtest 1 (CL2.0 source) is skipped when the device reports OpenCL < 2.0.
+void OCLKernelBinary::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+
+  char strVersion[128];
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_VERSION,
+                                     sizeof(strVersion), strVersion, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // strVersion[7] is the major digit of "OpenCL <major>.<minor> ...".
+  if (test == 1 && strVersion[7] < '2') {
+    program_ = NULL;
+    return;
+  }
+
+  const char *options, *options0;
+  const char* strKernel;
+  switch (test) {
+    case 0:
+      options = "";
+      options0 = "-O0";
+      strKernel = strKernel12;
+      break;
+    case 1:
+      options = "-cl-std=CL2.0";
+      options0 = "-cl-std=CL2.0 -O0";
+      strKernel = strKernel20;
+      break;
+    default:
+      assert(false);
+      return;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  size_t* sizes = new size_t[deviceCount_];
+  size_t* sizes1 = new size_t[deviceCount_];
+  size_t* sizes2 = new size_t[deviceCount_];
+
+  unsigned int programInfoDeviceIdIndex = 0;
+  cl_device_id* programInfoDevices = new cl_device_id[deviceCount_];
+  // get an array of device Id's that relate to values order returned by
+  // 'clGetProgramInfo'
+  error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_DEVICES,
+                                      sizeof(cl_device_id) * deviceCount_,
+                                      programInfoDevices, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo()");
+  // map between the class devices_ array and the programInfoDeviceId array
+  for (unsigned int i = 0; i < deviceCount_; i++) {
+    if (devices_[deviceId] == programInfoDevices[i]) {
+      programInfoDeviceIdIndex = i;
+    }
+  }
+  delete[] programInfoDevices;
+
+  error_ =
+      _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                 sizeof(size_t) * deviceCount_, sizes, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo()");
+
+  unsigned char** binaries = new unsigned char*[deviceCount_];
+
+  for (unsigned int i = 0; i < deviceCount_; i++) {
+    if (sizes[i] > 0) {
+      binaries[i] = new unsigned char[sizes[i]];
+    } else {
+      binaries[i] = NULL;
+    }
+  }
+
+  error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
+                                      sizeof(unsigned char*) * deviceCount_,
+                                      binaries, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo()");
+
+  error_ = _wrapper->clReleaseProgram(program_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clReleaseProgram()");
+
+  const unsigned char* cBinary = binaries[programInfoDeviceIdIndex];
+  cl_int status;
+  program_ = _wrapper->clCreateProgramWithBinary(
+      context_, 1, &devices_[deviceId], &(sizes[programInfoDeviceIdIndex]),
+      &cBinary, &status, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithBinary()");
+
+  for (unsigned int i = 0; i < deviceCount_; i++) {
+    if (binaries[i] != NULL) delete[] binaries[i];
+  }
+  delete[] binaries;
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], options0,
+                                    NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clBuildProgram(binary) failed");
+
+  error_ =
+      _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                 sizeof(size_t) * deviceCount_, sizes1, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clGetProgramInfo()");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "foo", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clCreateKernel() failed");
+
+  // Release the first kernel so the program can be rebuilt below.
+  error_ = _wrapper->clReleaseKernel(kernel_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clReleaseKernel() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], options0,
+                                    NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "2nd clBuildProgram(binary) failed");
+
+  error_ =
+      _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                 sizeof(size_t) * deviceCount_, sizes2, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "2nd clGetProgramInfo()");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "foo", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "2nd clCreateKernel() failed");
+
+  cl_mem buffer;
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                    2 * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  delete[] sizes;
+  delete[] sizes1;
+  delete[] sizes2;
+}
+
+void OCLKernelBinary::run(void) {
+  // open() leaves program_ NULL when the subtest is skipped.
+  if (program_ == NULL) return;
+
+  // Kernel arguments: the output buffer and the loop count n.
+  cl_mem outBuf = buffers()[0];
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &outBuf);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  cl_int loopCount = 2;
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_int), &loopCount);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Launch two work-items, then read both results back with a blocking read.
+  size_t range[1] = {2};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, range, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  cl_uint results[2] = {0, 0};
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], outBuf, true, 0,
+                                         sizeof(results), results, 0, NULL,
+                                         NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  if (results[0] != 4) {
+    CHECK_RESULT(true, "Incorrect result of kernel execution!");
+  }
+}
+
+unsigned int OCLKernelBinary::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h
new file mode 100644
index 0000000000..6453393d76
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_KERNEL_BINARY_H_
+#define _OCL_KERNEL_BINARY_H_
+
+#include "OCLTestImp.h"
+
+class OCLKernelBinary : public OCLTestImp {
+ public:
+  OCLKernelBinary();
+  virtual ~OCLKernelBinary();
+  // Test entry points, defined in OCLKernelBinary.cpp.
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_KERNEL_BINARY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp
new file mode 100644
index 0000000000..b33c624efd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp
@@ -0,0 +1,371 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLLDS32K.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/cl.h"
+// #include <stdint.h>
+#include <CL/cl.h>
+
+typedef unsigned int uint32_t;
+
+#define LDS_SIZE 32768
+#define LOCAL_WORK_SIZE 64
+
+// We'll do a 64MB transaction
+#define A_SIZE (8 * 1024 * 1024)
+#define B_SIZE A_SIZE
+#define C_SIZE A_SIZE
+#define D_SIZE A_SIZE
+
+#define GLOBAL_WORK_SIZE (A_SIZE / LDS_SIZE * LOCAL_WORK_SIZE)
+
+#define TEST_NAME "lds 32K"
+
+// A 32 KB LDS block holds 8192 uint elements; with LOCAL_WORK_SIZE (64)
+// work-items per group, each work-item handles 8192 / 64 = 128 values.
+static const char program_source[] = KERNEL(
+    __kernel void the_kernel(__global const uint *a, __global const uint *b,
+                             __global const uint *c, __global uint *d,
+                             __global uint *e) {
+      __local uint lds[8192];
+      uint gid = get_global_id(0);
+      __global const uint *ta = a + 128 * gid;
+      __global const uint *tb = b + 128 * gid;
+      __global const uint *tc = c + 128 * gid;
+      __global uint *td = d + 128 * gid;
+      uint i;
+
+      for (i = 0; i < 128; ++i) lds[ta[i]] = tc[i];
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      for (i = 0; i < 128; ++i) td[i] = lds[tb[i]];
+    } __kernel void the_kernel2(__global uint *d) {
+      __local uint lds[8192];
+      uint i;
+      uint gid = get_global_id(0);
+
+      for (i = 0; i < 128; ++i) lds[i] = d[gid];
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      for (i = 0; i < 128; ++i) d[gid] = lds[i];
+    });
+// Fills each LDS_SIZE block: a/b get shuffles of p, c random data, d a marker.
+static void fill(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+                 uint32_t *e) {
+  uint32_t i, j, k, t;
+  static uint32_t p[LDS_SIZE / 4];
+  static int is_set = 0;
+  // p is set to 0..LDS_SIZE/4-1 on the first call only; later swaps persist.
+  if (!is_set) {
+    for (i = 0; i < LDS_SIZE / 4; ++i) p[i] = i;
+    is_set = 1;
+  }
+  // One iteration per LDS_SIZE-byte block; e is accepted but never written.
+  for (j = 0; j < A_SIZE / LDS_SIZE; ++j) {
+    for (i = 0; i < LDS_SIZE / 4; ++i) {
+      k = rand() % (LDS_SIZE / 4);
+      t = p[i];
+      p[i] = p[k];
+      p[k] = t;
+
+      c[i] = rand();
+    }
+    memcpy(a, p, LDS_SIZE);
+    // Swap entries of p once more, then store that ordering in b.
+    for (i = 0; i < LDS_SIZE / 4; ++i) {
+      k = rand() % (LDS_SIZE / 4);
+      t = p[i];
+      p[i] = p[k];
+      p[k] = t;
+
+      d[i] = 0xfeedbeefU;
+    }
+    memcpy(b, p, LDS_SIZE);
+    // Advance all four pointers to the next block.
+    a += LDS_SIZE / 4;
+    b += LDS_SIZE / 4;
+    c += LDS_SIZE / 4;
+    d += LDS_SIZE / 4;
+  }
+}
+// Host check of d, block by block; returns EXIT_FAILURE on the first mismatch.
+static int check(const uint32_t *a, const uint32_t *b, const uint32_t *c,
+                 const uint32_t *d, const uint32_t *e) {
+  uint32_t i, j, t;
+  uint32_t lds[LDS_SIZE / 4];
+  // Per block: scatter c through a into lds, then expect d[i] == lds[b[i]].
+  for (j = 0; j < A_SIZE / LDS_SIZE; ++j) {
+    for (i = 0; i < LDS_SIZE / 4; ++i) lds[i] = 0xdeadbeef;
+
+    for (i = 0; i < LDS_SIZE / 4; ++i) lds[a[i]] = c[i];
+
+    for (i = 0; i < LDS_SIZE / 4; ++i) {
+      t = lds[b[i]];
+      if (d[i] != t) {
+        printf("mismatch group %u thread %u element %u: %u instead of %u\n", j,
+               i / 128, i % 128, d[i], t);
+        return EXIT_FAILURE;
+      }
+    }
+    // Move on to the next LDS_SIZE-byte block.
+    a += LDS_SIZE / 4;
+    b += LDS_SIZE / 4;
+    c += LDS_SIZE / 4;
+    d += LDS_SIZE / 4;
+  }
+  return EXIT_SUCCESS;
+}
+
+#ifndef E_SIZE
+#define E_SIZE 32
+#endif
+// Builds program_, both kernels and buffers a-e; error_ reports the outcome.
+void OCLLDS32K::setup_run(const char *cmplr_opt) {
+  cl_ulong lsize = 0;
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_LOCAL_MEM_SIZE,
+                                sizeof(lsize), &lsize, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo failed");
+  if (lsize < LDS_SIZE) {
+    fprintf(stderr, "Passed! Test does not support 32kb of lds space!");
+    return;  // NOTE(review): error_ is CL_SUCCESS here, so run() carries on
+  }
+
+  // create the program
+  const char *ps[1] = {program_source};
+  program_ =
+      _wrapper->clCreateProgramWithSource(context_, 1, ps, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+
+  // build the program
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    cmplr_opt, NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char build_log[16384];
+    fprintf(stderr, "build program failed, err=%d\n", error_);
+    // Query the log into its own status so error_ keeps the build failure.
+    cl_int log_err = _wrapper->clGetProgramBuildInfo(
+        program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG, sizeof(build_log),
+        build_log, NULL);
+    if (log_err != CL_SUCCESS)
+      fprintf(stderr, "failed to get build log, err=%d\n", log_err);
+    else
+      fprintf(stderr, "----- Build Log -----\n%s\n----- ----- --- -----\n",
+              build_log);
+    return;
+  }
+
+  // create the kernel
+  kernel_ = _wrapper->clCreateKernel(program_, "the_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a kernel failed");
+
+  // create the second kernel
+  kernel2_ = _wrapper->clCreateKernel(program_, "the_kernel2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a kernel failed");
+
+  // allocate the buffer memory objects
+  a_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, A_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer a failed");
+  buffers_.push_back(a_buf_);
+
+  b_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, B_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer b failed");
+  buffers_.push_back(b_buf_);
+
+  c_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, C_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer c failed");
+  buffers_.push_back(c_buf_);
+
+  d_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, D_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer d failed");
+  buffers_.push_back(d_buf_);
+
+  e_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, E_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer e failed");
+  buffers_.push_back(e_buf_);
+
+  // set the args values
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&a_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&b_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), (void *)&c_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&d_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_mem), (void *)&e_buf_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "setkernelArg failed!");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void *)&d_buf_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "setkernelArg failed!");
+}
+
+// Releases kernel2_ (created by setup_run) and clears the stale handle.
+void OCLLDS32K::cleanup_run() {
+  if (kernel2_) _wrapper->clReleaseKernel(kernel2_);
+  kernel2_ = NULL;  // a repeated call must not release the handle twice
+}
+// Uploads a/b/c, enqueues kernel_ three times, then reads back d and e.
+void OCLLDS32K::exec_kernel(void *a_mem, void *b_mem, void *c_mem, void *d_mem,
+                            void *e_mem) {
+  size_t global_work_size[1];
+  size_t local_work_size[1];
+
+  // Send data to device (blocking writes)
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], a_buf_, CL_TRUE, 0, A_SIZE, a_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueWritebuffer failed");
+
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], b_buf_, CL_TRUE, 0, B_SIZE, b_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueWritebuffer failed");
+
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], c_buf_, CL_TRUE, 0, C_SIZE, c_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueWritebuffer failed");
+
+  // set work-item dimensions
+  global_work_size[0] = GLOBAL_WORK_SIZE;
+  local_work_size[0] = LOCAL_WORK_SIZE;
+
+  // execute kernel
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, global_work_size,
+                                            local_work_size, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel failed");
+
+  // execute kernel again with the same arguments
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, global_work_size,
+                                            local_work_size, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel failed");
+
+  // execute kernel (third launch; NOTE(review): kernel2_ is never enqueued)
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, global_work_size,
+                                            local_work_size, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel failed");
+
+  // read results (blocking reads)
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], d_buf_, CL_TRUE,
+                                         0, D_SIZE, d_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  // the_kernel never stores through e, so this returns e_buf_'s prior contents
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], e_buf_, CL_TRUE,
+                                         0, E_SIZE, e_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+}
+
+const char *OCLLDS32K::kernel_src = "";
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // unused no-op
+// kernel2_ starts NULL so cleanup_run() is safe when setup_run() bails early.
+OCLLDS32K::OCLLDS32K() : kernel2_(NULL) { _numSubTests = 1; }
+
+OCLLDS32K::~OCLLDS32K() {}
+// Stores the sub-test id and device index, then runs the base-class open.
+void OCLLDS32K::open(unsigned int test, char *units, double &conversion,
+                     unsigned int deviceId) {
+  testID_ = test;
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+}
+
+void OCLLDS32K::run(void) {
+  // Builds the program, then repeats `rounds` passes of: generate host data,
+  // upload and execute on the device, and compare d against a host-side
+  // recomputation. Every failure is reported through CHECK_RESULT.
+  const int rounds = 5;
+  const char *cmplr_opt = NULL;
+
+  setup_run(cmplr_opt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "setup_run failed!");
+
+  // Host-side copies of the five device buffers. calloc returns zero-filled
+  // storage, which replaces the earlier malloc + memset pairs.
+  uint32_t *a = (uint32_t *)calloc(1, A_SIZE);
+  uint32_t *b = (uint32_t *)calloc(1, B_SIZE);
+  uint32_t *c = (uint32_t *)calloc(1, C_SIZE);
+  uint32_t *d = (uint32_t *)calloc(1, D_SIZE);
+  uint32_t *e = (uint32_t *)calloc(1, E_SIZE);
+  const bool allocated =
+      a != NULL && b != NULL && c != NULL && d != NULL && e != NULL;
+
+  // Outcomes are recorded here and reported only after the buffers have been
+  // freed: CHECK_RESULT appears to leave the function on failure (TODO
+  // confirm), which previously leaked all five allocations.
+  bool executed = true;
+  bool verified = true;
+
+  for (int j = 0; allocated && j < rounds; ++j) {
+    // Fresh shuffles and payload for this round.
+    fill(a, b, c, d, e);
+
+    // Upload, launch and read back; failures surface through error_.
+    exec_kernel(a, b, c, d, e);
+    if (error_ != CL_SUCCESS) {
+      executed = false;
+      break;
+    }
+
+    // check() returns EXIT_SUCCESS or EXIT_FAILURE, never a negative value,
+    // so the former "< 0" comparison could not detect a mismatch.
+    if (check(a, b, c, d, e) != EXIT_SUCCESS) {
+      verified = false;
+      break;
+    }
+  }
+
+  // Release host memory before any CHECK_RESULT below can return.
+  free(a);
+  free(b);
+  free(c);
+  free(d);
+  free(e);
+
+  // Report in the original order: allocation, execution, verification.
+  CHECK_RESULT(!allocated, "malloc failed");
+  CHECK_RESULT(!executed, "exec_kernel failed!");
+  CHECK_RESULT(!verified, " Failed!\n");
+}
+// Calls cleanup_run(), then returns whatever OCLTestImp::close() returns.
+unsigned int OCLLDS32K::close(void) {
+  cleanup_run();
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h
new file mode 100644
index 0000000000..e398e9e615
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_LDS32K_H_
+#define _OCL_LDS32K_H_
+#include "OCLTestImp.h"
+
+// LDS test case: setup_run/exec_kernel/cleanup_run are driven by run/close.
+class OCLLDS32K : public OCLTestImp {
+ public:
+  OCLLDS32K();
+  virtual ~OCLLDS32K();
+
+  virtual void open(unsigned int test, char *units, double &conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+  void setup_run(const char *cmplr_opt);
+  void cleanup_run();
+  void exec_kernel(void *a_mem, void *b_mem, void *c_mem, void *d_mem,
+                   void *e_mem);
+  static const char *kernel_src;
+  cl_kernel kernel2_;
+
+ private:
+  unsigned int testID_;
+  cl_mem a_buf_;
+  cl_mem b_buf_;
+  cl_mem c_buf_;
+  cl_mem d_buf_;
+  cl_mem e_buf_;
+};
+
+#endif  // _OCL_LDS32K_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp
new file mode 100644
index 0000000000..a9fd35287c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp
@@ -0,0 +1,187 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLLinearFilter.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static size_t ImageSize = 4;
+
+const static char* strKernel =
+    "const sampler_t g_Sampler =    CLK_FILTER_LINEAR |                 \n"
+    "                               CLK_ADDRESS_CLAMP_TO_EDGE |         \n"
+    "                               CLK_NORMALIZED_COORDS_FALSE;        \n"
+    "                                                                   \n"
+    "__kernel void linear3D(__read_only image3d_t img3D, __global float4* "
+    "f4Tata) \n"
+    "{                                                                  \n"
+    "   float4 f4Index = { 2.25f, 1.75f, 0.5f, 0.0f };                  \n"
+    "	// copy interpolated data in result buffer                      \n"
+    "	f4Tata[0] = read_imagef(img3D, g_Sampler, f4Index);             \n"
+    "}                                                                  \n"
+    "                                                                   \n"
+    "__kernel void linear2D(__read_only image2d_t img2D, __global float4* "
+    "f4Tata) \n"
+    "{                                                                  \n"
+    "   float2 f2Index = { 2.25f, 1.75f };                              \n"
+    "	// copy interpolated data in result buffer                      \n"
+    "	f4Tata[0] = read_imagef(img2D, g_Sampler, f2Index);             \n"
+    "}                                                                  \n"
+    "                                                                   \n";
+
+OCLLinearFilter::OCLLinearFilter() { _numSubTests = 2; }
+
+OCLLinearFilter::~OCLLinearFilter() {}
+// Builds the program, creates the kernel for `test`, and uploads its image.
+void OCLLinearFilter::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  // Return early unless every device reports image support.
+  for (size_t i = 0; i < deviceCount_; ++i) {
+    cl_bool imageSupport = CL_FALSE;  // a failed query counts as "no"
+    _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT,
+                              sizeof(imageSupport), &imageSupport, NULL);
+    if (!imageSupport) {
+      return;
+    }
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  const char* kernels[2] = {"linear3D", "linear2D"};
+  kernel_ = _wrapper->clCreateKernel(program_, kernels[test], &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem memory;
+  size_t offset[3] = {0, 0, 0};
+  cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
+
+  if (test == 0) {
+    float data[ImageSize][ImageSize][ImageSize][4];
+    // Texel (x, y, z) carries its own coordinates in RGB and 1.0 in alpha.
+    size_t region[3] = {ImageSize, ImageSize, ImageSize};
+    for (size_t z = 0; z < ImageSize; ++z) {
+      for (size_t y = 0; y < ImageSize; ++y) {
+        for (size_t x = 0; x < ImageSize; ++x) {
+          data[z][y][x][0] = (float)x;
+          data[z][y][x][1] = (float)y;
+          data[z][y][x][2] = (float)z;
+          data[z][y][x][3] = 1.0f;
+        }
+      }
+    }
+    memory = _wrapper->clCreateImage3D(context_, CL_MEM_READ_ONLY, &imageFormat,
+                                       ImageSize, ImageSize, ImageSize, 0, 0,
+                                       NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed");
+
+    error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, true,
+                                           offset, region, 0, 0, data, 0, NULL,
+                                           NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+  } else {
+    float data[ImageSize][ImageSize][4];  // [y][x][channel]
+    size_t region[3] = {ImageSize, ImageSize, 1};
+    for (size_t y = 0; y < ImageSize; ++y) {
+      for (size_t x = 0; x < ImageSize; ++x) {
+        data[y][x][0] = (float)x;
+        data[y][x][1] = (float)y;
+        data[y][x][2] = data[y][x][3] = 1.0f;
+      }
+    }
+
+    memory = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY, &imageFormat,
+                                       ImageSize, ImageSize, 0, NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed");
+    error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, true,
+                                           offset, region, 0, 0, data, 0, NULL,
+                                           NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+  }
+  buffers_.push_back(memory);
+
+  memory = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                    4 * sizeof(cl_float), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(memory);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused no-op
+// Launches the kernel on one work-item and checks x/y of the sampled texel.
+void OCLLinearFilter::run(void) {
+  // Return early unless every device reports image support.
+  for (size_t dev = 0; dev < deviceCount_; ++dev) {
+    cl_bool hasImages;
+    size_t retSize;
+    _wrapper->clGetDeviceInfo(devices_[dev], CL_DEVICE_IMAGE_SUPPORT,
+                              sizeof(hasImages), &hasImages, &retSize);
+    if (!hasImages) return;
+  }
+
+  cl_mem srcImage = buffers()[0];
+  cl_mem dstBuffer = buffers()[1];
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcImage);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &dstBuffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  size_t globalSize[1] = {1};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, globalSize, NULL, 0, NULL,
+                                            NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  // Blocking read of the float4 the kernel stored in f4Tata[0].
+  cl_float result[4] = {0.f, 0.f, 0.f, 0.f};
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer,
+                                         CL_TRUE, 0, sizeof(result), result, 0,
+                                         NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+  // Only the first two components are compared against the reference.
+  const cl_float expected[2] = {1.75f, 1.25f};
+  for (cl_uint c = 0; c < 2; ++c) {
+    if (result[c] == expected[c]) continue;
+    printf("%.2f != %.2f [ref]", result[c], expected[c]);
+    CHECK_RESULT(true, " - Incorrect result for linear filtering!\n");
+  }
+}
+
+unsigned int OCLLinearFilter::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h
new file mode 100644
index 0000000000..e0b007c5f6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_LINEAR_FILTER_H_
+#define _OCL_LINEAR_FILTER_H_
+
+#include "OCLTestImp.h"
+
+// Linear-filtering test case; declares the open/run/close hooks it implements.
+class OCLLinearFilter : public OCLTestImp {
+ public:
+  OCLLinearFilter();
+  virtual ~OCLLinearFilter();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+};
+
+#endif  // _OCL_LINEAR_FILTER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp
new file mode 100644
index 0000000000..f5afad8f42
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp
@@ -0,0 +1,264 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLLiquidFlash.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <cstdio>
+#include <fstream>
+#include <sstream>
+
+#include "CL/cl.h"
+
+const static size_t ChunkSize = 256 * 1024;
+const static int NumSizes = 5;
+const static int NumChunksArray[NumSizes] = {1, 4, 16, 32, 56};
+const static size_t MaxSubTests = 4 * NumSizes;
+const static char* BinFileName = "LiquidFlash.bin";
+const static int NumIterArray[NumSizes] = {20, 15, 10, 10, 10};
+const static int NumStagesArray[NumSizes] = {2, 2, 4, 4, 4};
+
+OCLLiquidFlash::OCLLiquidFlash() {
+  // These three fields get the same initial values whether or not the
+  // OpenCL 2.0 headers are available.
+  failed_ = false;
+  maxSize_ = 0;
+  direct_ = false;
+#ifdef CL_VERSION_2_0
+  _numSubTests = MaxSubTests;
+  amdFile_ = NULL;
+#else
+  // Without CL_VERSION_2_0 the sub-test count is zero.
+  _numSubTests = 0;
+#endif
+}
+
+OCLLiquidFlash::~OCLLiquidFlash() {}
+// Per sub-test setup: extension check, data file creation, buffer allocation.
+void OCLLiquidFlash::open(unsigned int test, char* units, double& conversion,
+                          unsigned int deviceId) {
+#ifdef CL_VERSION_2_0
+  failed_ = false;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+  char name[1024] = {0};
+  size_t size = 0;
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024,
+                            name, &size);
+
+  if (!strstr(name, "cl_amd_liquid_flash")) {
+    printf("Liquid flash extension is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  // testID_ / 4 selects the size entry, testID_ % 4 the buffer variant.
+  NumChunks = NumChunksArray[testID_ / 4];
+  NumIter = NumIterArray[testID_ / 4];
+  NumStages = NumStagesArray[testID_ / 4];
+  BufferSize = NumChunks * ChunkSize * sizeof(cl_uint);
+  direct_ = (testID_ % 4) < 3;  // variants 0-2 read into a single buffer
+  createFile =
+      (clCreateSsgFileObjectAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clCreateSsgFileObjectAMD");
+  retainFile =
+      (clRetainSsgFileObjectAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clRetainSsgFileObjectAMD");
+  releaseFile =
+      (clReleaseSsgFileObjectAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clReleaseSsgFileObjectAMD");
+  writeBufferFromFile =
+      (clEnqueueReadSsgFileAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clEnqueueReadSsgFileAMD");
+  if (createFile == NULL || retainFile == NULL || releaseFile == NULL ||
+      writeBufferFromFile == NULL) {
+    testDescString = "Failed to initialize LiquidFlash extension!\n";
+    failed_ = true;
+    return;
+  }
+
+  size_t chunkSize = ChunkSize;
+  std::ofstream fs;
+  fs.open(BinFileName, std::fstream::binary);
+  // Write NumChunks copies of the ramp 0..chunkSize-1 into the data file.
+  if (fs.is_open()) {
+    // allocate memory for file content
+    cl_uint* buffer = new cl_uint[chunkSize];
+    for (cl_uint i = 0; i < chunkSize; ++i) {
+      buffer[i] = i;
+    }
+    for (int i = 0; i < NumChunks; ++i) {
+      fs.write(reinterpret_cast<char*>(buffer), chunkSize * sizeof(cl_uint));
+    }
+    delete[] buffer;
+  }
+  fs.close();
+
+  std::string str(BinFileName);
+  std::wstring wc(str.length(), L' ');
+  // Copy string to wstring.
+  std::copy(str.begin(), str.end(), wc.begin());
+
+  amdFile_ = createFile(context_, CL_FILE_READ_ONLY_AMD, wc.c_str(), &error_);
+  if (error_ != CL_SUCCESS) {
+    printf(
+        "Create file failed. Liquid flash support is required for this "
+        "test!\n");
+    failed_ = true;
+    return;
+  }
+
+  cl_mem buf = NULL;
+  if (direct_) {
+    cl_uint subTest = testID_ % 4;
+    cl_mem_flags memFlags = (subTest == 0)
+                                ? CL_MEM_USE_PERSISTENT_MEM_AMD
+                                : ((subTest == 1) ? CL_MEM_ALLOC_HOST_PTR : 0);
+    buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY | memFlags,
+                                   BufferSize, NULL, &error_);
+    // Report the call that actually failed (message previously named another).
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  } else {
+    for (int i = 0; i < NumStages; ++i) {
+      buf = _wrapper->clCreateBuffer(context_,
+                                     CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                     BufferSize / NumStages, NULL, &error_);
+      // One staging buffer per stage, each BufferSize / NumStages bytes.
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+      buffers_.push_back(buf);
+    }
+
+    buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, BufferSize,
+                                   NULL, &error_);
+    // Full-size buffer; it is pushed last, after the staging buffers.
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  }
+  buffers_.push_back(buf);
+#endif
+}
+
+// Times the file-to-buffer transfer over NumIter passes, validates the data
+// read back and derives _perfInfo (MB/s) from the fastest pass.
+void OCLLiquidFlash::run(void) {
+#ifdef CL_VERSION_2_0
+  if (failed_) {
+    return;
+  }
+  size_t finalBuf = (direct_) ? 0 : NumStages;
+
+  // Zero-filled host memory, owned by the vector so that it is released on
+  // every exit path.
+  std::vector<cl_uint> storage(NumChunks * ChunkSize);
+  cl_uint* buffer = &storage[0];
+  size_t iterSize = BufferSize / NumStages;
+  if (direct_) {
+    error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers_[0],
+                                            CL_TRUE, 0, BufferSize, buffer, 0,
+                                            NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+  } else {
+    for (int stage = 0; stage < NumStages; ++stage) {
+      error_ = _wrapper->clEnqueueWriteBuffer(
+          cmdQueues_[_deviceId], buffers_[stage], CL_TRUE, 0, iterSize, buffer,
+          0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+    }
+    error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId],
+                                            buffers_[finalBuf], CL_TRUE, 0,
+                                            BufferSize, buffer, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+  }
+
+  CPerfCounter timer;
+  double sec = 0.;
+
+  for (int i = 0; i < NumIter; ++i) {
+    timer.Reset();
+    timer.Start();
+    if (direct_) {
+      error_ = writeBufferFromFile(
+          cmdQueues_[_deviceId], buffers_[0], CL_FALSE, 0 /*buffer_offset*/,
+          BufferSize, amdFile_ /*file*/, 0 /*file_offset*/, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "writeBufferFromFile() failed");
+    } else {
+      for (int stage = 0; stage < NumStages; ++stage) {
+        error_ = writeBufferFromFile(
+            cmdQueues_[_deviceId], buffers_[stage], CL_FALSE,
+            0 /*buffer_offset*/, iterSize, amdFile_ /*file*/,
+            iterSize * stage /*file_offset*/, 0, NULL, NULL);
+        CHECK_RESULT((error_ != CL_SUCCESS), "writeBufferFromFile() failed");
+
+        error_ = _wrapper->clEnqueueCopyBuffer(
+            cmdQueues_[_deviceId], buffers_[stage], buffers_[NumStages], 0,
+            iterSize * stage, iterSize, 0, NULL, NULL);
+        CHECK_RESULT((error_ != CL_SUCCESS), "CopyBuffer() failed");
+        _wrapper->clFlush(cmdQueues_[_deviceId]);
+      }
+    }
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+    timer.Stop();
+    // Keep the fastest iteration.
+    double cur = timer.GetElapsedTime();
+    sec = (i == 0) ? cur : std::min(cur, sec);
+  }
+
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId],
+                                         buffers_[finalBuf], CL_TRUE, 0,
+                                         BufferSize, buffer, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+  // As in the calls above, a true first argument is what signals a failure.
+  for (int c = 0; c < NumChunks; ++c) {
+    for (cl_uint i = 0; i < ChunkSize; ++i) {
+      if (buffer[c * ChunkSize + i] != i) {
+        CHECK_RESULT(true, "Validation failed!");
+      }
+    }
+  }
+
+  static const char* MemTypeStr[] = {"Visible  ", "Remote   ", "Invisible",
+                                     "Staging"};
+  _perfInfo = (float)BufferSize / ((float)sec * 1024.f * 1024.f);
+  std::stringstream str;
+  str << "WriteBufferFromFile performance (";
+  str << BufferSize / (1024 * 1024);
+  str << " MB of " << MemTypeStr[testID_ % 4] << ") transfer speed (MB/s):";
+  testDescString = str.str();
+#endif
+}
+
+// Releases the file object and removes the data file named BinFileName.
+unsigned int OCLLiquidFlash::close(void) {
+#ifdef CL_VERSION_2_0
+  if (!failed_ && amdFile_ != NULL) {
+    releaseFile(amdFile_);
+    amdFile_ = NULL;
+  }
+  // Best effort, also after a failed open(): the file may already exist then.
+  (void)remove(BinFileName);
+  return OCLTestImp::close();
+#else
+  return CL_SUCCESS;
+#endif
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h
new file mode 100644
index 0000000000..a44d4ffdb2
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_LIQUID_FLASH_H_
+#define _OCL_LIQUID_FLASH_H_
+
+#include "OCLTestImp.h"
+
+class OCLLiquidFlash : public OCLTestImp {
+ public:
+  OCLLiquidFlash();
+  virtual ~OCLLiquidFlash();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // set by open() on failure; run() returns at once when set
+  unsigned int testID_;  // testID_ % 4 picks the memory flags and label
+  cl_ulong maxSize_;
+#ifdef CL_VERSION_2_0
+  cl_file_amd amdFile_;  // file object returned by createFile in open()
+#endif
+  bool direct_;  // true: file -> buffers_[0]; false: via NumStages buffers
+  size_t BufferSize;  // size in bytes of the destination buffer
+  int NumChunks;
+  int NumIter;  // number of timed iterations in run()
+  int NumStages;  // number of staging buffers when direct_ is false
+#ifdef CL_VERSION_2_0
+  clCreateSsgFileObjectAMD_fn createFile;
+  clRetainSsgFileObjectAMD_fn retainFile;
+  clReleaseSsgFileObjectAMD_fn releaseFile;
+  clEnqueueReadSsgFileAMD_fn writeBufferFromFile;
+#endif
+};
+
+#endif  // _OCL_LIQUID_FLASH_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp
new file mode 100644
index 0000000000..5746f19b88
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLMapCount.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+OCLMapCount::OCLMapCount() { _numSubTests = 1; }  // a single sub-test
+
+OCLMapCount::~OCLMapCount() {}  // nothing to release here
+
+// Verifies CL_MEM_MAP_COUNT: 1 while a buffer is mapped, 0 once it has been
+// unmapped. The whole check runs here; run() is empty.
+void OCLMapCount::open(unsigned int test, char* units, double& conversion,
+                       unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  // Size the buffer from the device's base address alignment (at least 128).
+  cl_uint addressAlign = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+                                     sizeof(addressAlign), &addressAlign, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_MEM_BASE_ADDR_ALIGN failed");
+  if (addressAlign < 128) addressAlign = 128;
+  const size_t bufSize = addressAlign * 4;
+
+  // Host backing store for the CL_MEM_USE_HOST_PTR buffer. Declared before
+  // memObject so that it is freed only after the buffer has been released.
+  struct HostMem {
+    void* ptr;
+    explicit HostMem(size_t bytes) : ptr(malloc(bytes)) {}
+    ~HostMem() { free(ptr); }
+  } hostMem(bufSize);
+  CHECK_RESULT((hostMem.ptr == NULL), "malloc() failed");
+
+  // Create a buffer to test against
+  clMemWrapper memObject(_wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, bufSize, hostMem.ptr,
+      &error_));
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create buffer to test");
+
+  void* mapped = _wrapper->clEnqueueMapBuffer(
+      cmdQueues_[deviceId], memObject, true, CL_MAP_READ, 0, bufSize, 0, NULL,
+      NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed");
+
+  // Find the number of mappings on buffer after map
+  cl_uint mapCount = 0;
+  error_ = _wrapper->clGetMemObjectInfo(memObject, CL_MEM_MAP_COUNT,
+                                        sizeof(mapCount), &mapCount, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to get mem object map count");
+  if (mapCount != 1) {
+    printf("ERROR: map count after map is %u, expected 1\n", mapCount);
+    CHECK_RESULT(true, "Returned mem object map count does not validate!");
+  }
+
+  // Unmap buffer and wait, so the host memory is idle before it is freed
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[deviceId], memObject,
+                                             mapped, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject() failed");
+  error_ = _wrapper->clFinish(cmdQueues_[deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed");
+
+  // Find the number of mappings on buffer after unmap
+  error_ = _wrapper->clGetMemObjectInfo(memObject, CL_MEM_MAP_COUNT,
+                                        sizeof(mapCount), &mapCount, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to get mem object map count");
+  if (mapCount != 0) {
+    printf("ERROR: map count after unmap is %u, expected 0\n", mapCount);
+    CHECK_RESULT(true, "Returned mem object map count does not validate!");
+  }
+}
+
+void OCLMapCount::run(void) {}  // empty: open() performs the map-count checks
+
+unsigned int OCLMapCount::close(void) { return OCLTestImp::close(); }  // defers to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h
new file mode 100644
index 0000000000..7f3f09e7a8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MAP_COUNT_H_
+#define _OCL_MAP_COUNT_H_
+
+#include "OCLTestImp.h"
+
+class OCLMapCount : public OCLTestImp {
+ public:
+  OCLMapCount();
+  virtual ~OCLMapCount();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+};
+
+// Holds one cl_mem handle and releases it on destruction. Kept inside the
+// include guard so including this header twice does not redefine the class.
+class clMemWrapper {
+ public:
+  clMemWrapper() { mMem = NULL; }
+  clMemWrapper(cl_mem mem) { mMem = mem; }
+  ~clMemWrapper() {
+    if (mMem != NULL) clReleaseMemObject(mMem);
+  }
+  // Replaces the stored handle; the previously stored one is not released.
+  clMemWrapper& operator=(const cl_mem& rhs) {
+    mMem = rhs;
+    return *this;
+  }
+  operator cl_mem() { return mMem; }
+  cl_mem* operator&() { return &mMem; }
+  bool operator==(const cl_mem& rhs) { return mMem == rhs; }
+
+ protected:
+  cl_mem mMem;
+};
+
+#endif  // _OCL_MAP_COUNT_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp
new file mode 100644
index 0000000000..27d6eec6f4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp
@@ -0,0 +1,153 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLMemDependency.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static cl_uint Stages = 4;  // outer loop count in run()
+const static cl_uint ThreadsForCheck = 1 << Stages;  // 16: array length and global work size
+
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+const static char* strKernel = KERNEL_CODE(
+\n __kernel void bitonicSort(__global uint2* keys, uint stage, uint pass) {
+  const uint thread = get_global_id(0);
+
+  const uint pairDistance = 1 << (stage - pass);
+
+  /* Insert an additional zero bit at position (stage - pass): the bits below
+   * it are kept, the bits at and above it are shifted up by one. */
+  const uint leftID =
+      (thread & (pairDistance - 1)) |
+      ((thread & ~(pairDistance - 1)) << 1); /* low bits | shifted high bits */
+
+  const uint direction = ((thread >> stage) & 1) == 1 ? 0 : 1;
+
+  const uint rightID = leftID + pairDistance;
+  const uint2 left = keys[leftID];
+  const uint2 right = keys[rightID];
+
+  const uint2 larger = left.x > right.x ? left : right;
+  const uint2 smaller = left.x > right.x ? right : left;
+
+  keys[leftID] = direction ? smaller : larger;
+  keys[rightID] = direction ? larger : smaller;
+}
+\n);
+
+OCLMemDependency::OCLMemDependency() { _numSubTests = 1; }  // a single sub-test
+
+OCLMemDependency::~OCLMemDependency() {}  // nothing to release here
+
+// Builds the bitonicSort kernel and creates a buffer plus a sub-buffer that
+// spans that whole buffer; both are stored in buffers_ for run().
+void OCLMemDependency::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the query below fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "bitonicSort", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           ThreadsForCheck * sizeof(cl_uint2),
+                                           NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+  cl_buffer_region reg = {0, ThreadsForCheck * sizeof(cl_uint2)};
+  buffer =
+      _wrapper->clCreateSubBuffer(buffers()[0], CL_MEM_READ_WRITE,
+                                  CL_BUFFER_CREATE_TYPE_REGION, &reg, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSubBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // empty callback; nothing in this file references it
+
+void OCLMemDependency::run(void) {  // run the kernel passes, then compare with reference
+  cl_uint2 values[ThreadsForCheck] = {
+      {{3, 0}},   {{1, 5}},   {{4, 6}},  {{2, 4}}, {{0, 3}},  {{5, 10}},
+      {{15, 7}},  {{13, 8}},  {{10, 2}}, {{9, 1}}, {{7, 11}}, {{11, 9}},
+      {{14, 12}}, {{12, 14}}, {{6, 13}}, {{8, 15}}};
+  cl_uint2 reference[ThreadsForCheck] = {
+      {{0, 3}},   {{1, 5}},   {{3, 0}},  {{2, 4}}, {{4, 6}},  {{5, 10}},
+      {{6, 13}},  {{8, 15}},  {{7, 11}}, {{9, 1}}, {{10, 2}}, {{11, 9}},
+      {{14, 12}}, {{12, 14}}, {{15, 7}}, {{13, 8}}};
+  cl_uint2 results[ThreadsForCheck];
+
+  cl_mem buffer = buffers()[0];
+  error_ =
+      _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, true, 0,
+                                     sizeof(values), values, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  size_t gws[1] = {ThreadsForCheck};
+
+  for (unsigned int i = 0; i < Stages; ++i) {
+    buffer = buffers()[i % 2];  // alternate between the buffer and its sub-buffer
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+    for (unsigned int j = 0; j < i; ++j) {  // no kernel is enqueued for i == 0
+      error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &i);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+      error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), &j);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+      error_ = _wrapper->clEnqueueNDRangeKernel(
+          cmdQueues_[_deviceId], kernel_, 1, NULL, gws, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+    }
+  }
+
+  buffer = buffers()[0];  // read the result back through the first buffer
+  error_ =
+      _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0,
+                                    sizeof(results), results, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  for (unsigned int i = 0; i < ThreadsForCheck; ++i) {
+    if ((results[i].s[0] != reference[i].s[0]) ||
+        (results[i].s[1] != reference[i].s[1])) {
+      CHECK_RESULT(true, "Incorrect result for dependency!\n");
+    }
+  }
+}
+
+unsigned int OCLMemDependency::close(void) { return OCLTestImp::close(); }  // defers to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h
new file mode 100644
index 0000000000..2308ae25b8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MEM_DEPENDENCY_H_
+#define _OCL_MEM_DEPENDENCY_H_
+
+#include "OCLTestImp.h"
+
+class OCLMemDependency : public OCLTestImp {  // OCLTestImp subclass with open/run/close overrides
+ public:
+  OCLMemDependency();
+  virtual ~OCLMemDependency();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_MEM_DEPENDENCY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp
new file mode 100644
index 0000000000..8e4b3122ad
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLMemObjs.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <CL/cl.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+const char* OCLMemObjs::kernel_src = "";  // empty; test() builds from its own local source string
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // empty callback; nothing in this file references it
+
+OCLMemObjs::OCLMemObjs() { _numSubTests = 1; }  // a single sub-test
+
+OCLMemObjs::~OCLMemObjs() {}  // nothing to release here
+
+void OCLMemObjs::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId) {  // does not call OCLTestImp::open()
+  _crcword = 0;  // value later returned by close()
+  conversion = 1.0f;
+  _deviceId = deviceId;
+}
+
+// Regression check: a buffer bound as a kernel argument is released right
+// after the kernel is enqueued and before the queue is drained. Returns
+// EXIT_SUCCESS when every OpenCL call succeeds, EXIT_FAILURE otherwise.
+int OCLMemObjs::test(void) {
+  cl_int err = CL_SUCCESS;
+
+  std::vector<cl::Platform> platforms;
+  cl::Platform::get(&platforms);
+  if (platforms.empty()) {
+    std::cerr << "Platform::get() failed \n";
+    return EXIT_FAILURE;
+  }
+  cl_context_properties properties[] = {
+      CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+  cl::Context context(CL_DEVICE_TYPE_ALL, properties, NULL, NULL, &err);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Context::Context() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(&err);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Context::getInfo() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+  if (devices.size() == 0) {
+    std::cerr << "No device available\n";
+    return EXIT_FAILURE;
+  }
+
+  const char source[] = "__kernel void test_memobjs(__global int* ptr) {}";
+  cl::Program::Sources sources(1, std::make_pair(source, 0));
+
+  cl::Program program(context, sources, &err);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Program::Program() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+  err = program.build(devices);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Program::build() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  cl::Kernel kernel(program, "test_memobjs", &err);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Kernel::Kernel() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  cl::CommandQueue queue(context, devices[0], 0, &err);
+  if (err != CL_SUCCESS) {
+    std::cerr << "CommandQueue::CommandQueue() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  cl::Buffer buffer(context, (cl_mem_flags)0, 1024, NULL, &err);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Buffer::Buffer() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  err = kernel.setArg(0, buffer);
+  if (err != CL_SUCCESS) {
+    std::cerr << "Kernel::setArg() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  err = queue.enqueueTask(kernel);
+  if (err != CL_SUCCESS) {
+    std::cerr << "CommandQueue::enqueueTask() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  // Force a clReleaseMemoryObject on buffer before dispatch.
+  buffer = cl::Buffer();
+
+  err = queue.finish();
+  if (err != CL_SUCCESS) {
+    std::cerr << "CommandQueue::finish() failed (" << err << ")\n";
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
+
+void OCLMemObjs::run(void) {
+  CHECK_RESULT((test() != EXIT_SUCCESS), "test failed");  // any non-success status from test() is reported
+}
+
+unsigned int OCLMemObjs::close(void) { return _crcword; }  // _crcword is set to 0 in open()
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h
new file mode 100644
index 0000000000..c3a414eb4b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_Mem_Objs_H_
+#define _OCL_Mem_Objs_H_
+
+#include "CL/cl.h"
+#include "OCLTestImp.h"
+
+class OCLMemObjs : public OCLTestImp {
+ public:
+  OCLMemObjs();
+  virtual ~OCLMemObjs();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+  int test(void);  // returns EXIT_SUCCESS or EXIT_FAILURE; called by run()
+
+  static const char* kernel_src;  // defined as an empty string in OCLMemObjs.cpp
+
+ private:
+  cl_int error;
+};
+
+#endif  // _OCL_Mem_Objs_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp
new file mode 100644
index 0000000000..bbd3fdc085
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp
@@ -0,0 +1,200 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLMemoryInfo.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+OCLMemoryInfo::OCLMemoryInfo() {
+  failed_ = false;
+  // The second sub-test is only enabled when pointers are 64 bits wide.
+  _numSubTests = (sizeof(int*) != 8) ? 1 : 2;
+}
+
+OCLMemoryInfo::~OCLMemoryInfo() = default;
+
+// Opens the base test and sets failed_ (run() then does nothing) unless the
+// device is a GPU with cl_amd_device_attribute_query; APUs skip sub-test 1.
+void OCLMemoryInfo::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  _deviceId = deviceId;
+  test_ = test;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  char name[1024] = {0};
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
+                            sizeof(name), name, NULL);
+  if (!strstr(name, "cl_amd_device_attribute_query")) {
+    printf("AMD device attribute  extension is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  // Observed failures with APUs on GSL path due to incorrect available memory,
+  // reported for visible heap
+  cl_bool is_apu = false;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId],
+                                     CL_DEVICE_HOST_UNIFIED_MEMORY,
+                                     sizeof(cl_bool), &is_apu, nullptr);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_HOST_UNIFIED_MEMORY failed");
+  if (is_apu && (test == 1)) {
+    printf("Test not supported for apus, skipping...\n");
+    failed_ = true;
+    return;
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // deliberate no-op
+
+// Checks that CL_DEVICE_GLOBAL_FREE_MEMORY_AMD follows buffer allocations.
+// Sub-test 0 creates one buffer per attempt and expects the reported free
+// memory to drop by between 0.5x and 1.5x of its size; the other sub-test
+// keeps creating buffers until free memory runs low and compares the running
+// total with the reported drop. Does nothing when open() set failed_.
+void OCLMemoryInfo::run(void) {
+  if (failed_) {
+    return;
+  }
+
+  // Zero-filled host staging array that is freed on every exit path, in case
+  // one of the CHECK_RESULT calls below leaves the function early.
+  struct HostScratch {
+    cl_int4* data;
+    explicit HostScratch(size_t count) : data(new cl_int4[count]()) {}
+    ~HostScratch() { delete[] data; }
+  };
+
+  size_t BufSize = 0x1000000;
+  bool succeed = false;
+  bool done = false;
+  if (test_ == 0) {
+    // use multiple loops to make sure the failure case is not caused
+    // by reusing the allocation from the cached memory pool
+    for (int i = 0; i < 5 && !done; i++) {
+      cl_mem buffer;
+      size_t memoryInfo[2] = {0, 0};
+      _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,
+                                2 * sizeof(size_t), memoryInfo, NULL);
+      buffer =
+          _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                   BufSize * sizeof(cl_int4), NULL, &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+      buffers_.push_back(buffer);
+
+      // Clear destination buffer
+      HostScratch values(BufSize);
+      error_ = _wrapper->clEnqueueWriteBuffer(
+          cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4),
+          values.data, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+      size_t memoryInfo2[2] = {0, 0};
+      _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,
+                                2 * sizeof(size_t), memoryInfo2, NULL);
+      size_t dif = memoryInfo[0] - memoryInfo2[0];
+      if (dif == 0) {  // the buffer memory may come from the cached memory pool
+        BufSize *= 2;  // double the size and try again
+      } else if ((dif >=
+                  (static_cast<size_t>(BufSize * sizeof(cl_int4) * 1.5f) /
+                   1024)) ||
+                 (dif <= ((BufSize * sizeof(cl_int4) / 2) / 1024))) {
+        done = true;
+      } else {
+        succeed = true;
+        done = true;
+      }
+    }
+  } else {
+    size_t sizeAll;
+    size_t memoryInfo[2] = {0, 0};
+    _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                              CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,
+                              2 * sizeof(size_t), memoryInfo, NULL);
+    HostScratch values(BufSize);
+    // Loop a few times to make sure the results are consistent
+    for (int k = 0; k < 3; ++k) {
+      sizeAll = 0;
+      while (true) {
+        cl_mem buffer;
+        buffer =
+            _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                     BufSize * sizeof(cl_int4), NULL, &error_);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+        buffers_.push_back(buffer);
+
+        // Clear destination buffer
+        error_ = _wrapper->clEnqueueWriteBuffer(
+            cmdQueues_[_deviceId], buffer, CL_TRUE, 0,
+            BufSize * sizeof(cl_int4), values.data, 0, NULL, NULL);
+        CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+        sizeAll += BufSize * sizeof(cl_int4) / 1024;
+        size_t memoryInfo2[2] = {0, 0};
+        _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                  CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,
+                                  2 * sizeof(size_t), memoryInfo2, NULL);
+        if (memoryInfo2[0] < (0x50000 + (BufSize * sizeof(cl_int4) / 1024))) {
+          break;
+        }
+        size_t dif = memoryInfo[0] - memoryInfo2[0];
+        // extra memory could be allocated/destroyed in the driver; a zero
+        // drop counts as a mismatch and must not reach the division below
+        if (dif != 0 && ((dif / sizeAll) == 1 || (sizeAll / dif) == 1)) {
+          succeed = true;
+        } else {
+          succeed = false;
+          break;
+        }
+      }
+      for (auto& it : buffers()) {
+        error_ = _wrapper->clReleaseMemObject(it);
+        CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                               "clReleaseMemObject() failed");
+      }
+      buffers_.clear();
+      if (!succeed) {
+        break;
+      }
+    }
+  }
+
+  if (!succeed) {
+    CHECK_RESULT(true, "Reported free memory doesn't match allocated size!");
+  }
+}
+
+unsigned int OCLMemoryInfo::close() { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h
new file mode 100644
index 0000000000..8c36d53709
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MEMORY_INFO_H_
+#define _OCL_MEMORY_INFO_H_
+
+#include "OCLTestImp.h"
+
+// Test case derived from OCLTestImp; method bodies live in OCLMemoryInfo.cpp.
+class OCLMemoryInfo : public OCLTestImp {
+ public:
+  OCLMemoryInfo();
+  virtual ~OCLMemoryInfo();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  bool failed_;    // set by open() when the sub-test must be skipped
+  uint32_t test_;  // sub-test index passed to open()
+};
+
+#endif  // _OCL_MEMORY_INFO_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp
new file mode 100644
index 0000000000..743cd45815
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp
@@ -0,0 +1,295 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLMultiQueue.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+
+const static char* strKernel =  // copyInc: dst[i] = src[i] + 1 per work-item
+    "__kernel void                                     \n"
+    "copyInc(__global uint* dst, __global uint* src)   \n"
+    "{                                                 \n"
+    "    uint index = get_global_id(0);                \n"
+    "                                                  \n"
+    "    dst[index] = src[index] + 1;                  \n"
+    "}                                                 \n";
+// NOTE(review): useGPU is not referenced anywhere else in this file.
+static bool useGPU = true;
+
+static const cl_uint NumQueues = 8;  // must be power of 2
+static cl_uint NumElements = 4096;   // lower bound; open() may raise it
+static const cl_uint NumRuns = 16384;  // launches per sub-test over all queues
+static const cl_uint ExecutionsPerQueue = 256;  // basis of the flush interval
+std::stringstream lerror;  // failure text passed to CHECK_RESULT by run()
+
+// Pairs a source and a destination buffer on one command queue; check()
+// expects every source element to equal the number of successful run() calls.
+class MemTransfer {
+ public:
+  MemTransfer(OCLWrapper* wrapper, cl_context context, cl_command_queue queue,
+              cl_uint numElements)
+      : wrapper_(wrapper),
+        context_(context),
+        queue_(queue),
+        numElements_(numElements),
+        count_(0),
+        dst_(NULL),
+        src_(NULL) {}
+
+  // Releases only the buffers that create() actually obtained.
+  ~MemTransfer() {
+    if (dst_ != NULL) wrapper_->clReleaseMemObject(dst_);
+    if (src_ != NULL) wrapper_->clReleaseMemObject(src_);
+  }
+
+  // Creates both buffers; on failure appends the reason to lerror.
+  bool create() {
+    cl_int err;
+    size_t size = numElements_ * sizeof(cl_uint);
+    cl_uint* data = new cl_uint[numElements_]();  // value-initialized to 0
+    src_ = wrapper_->clCreateBuffer(context_, CL_MEM_COPY_HOST_PTR, size, data,
+                                    &err);
+    delete[] data;  // the buffer took its own copy at creation time
+    if (src_ == NULL) {
+      lerror << "clCreateBuffer() failed";
+      return false;
+    }
+    dst_ = wrapper_->clCreateBuffer(context_, 0, size, NULL, &err);
+    if (dst_ == NULL) {
+      lerror << "clCreateBuffer() failed";
+      return false;
+    }
+    return true;
+  }
+
+  // Binds dst/src as kernel arguments 0/1, enqueues the kernel over
+  // numElements_ rounded up to a multiple of 64, then copies dst into src.
+  bool run(cl_kernel kernel) {
+    size_t global_work_size[1] = {(numElements_ + 63) / 64 * 64};
+    size_t local_work_size[1] = {64};
+    size_t size = numElements_ * sizeof(cl_uint);
+
+    if (CL_SUCCESS !=
+        wrapper_->clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&dst_)) {
+      return false;
+    }
+    if (CL_SUCCESS !=
+        wrapper_->clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&src_)) {
+      return false;
+    }
+    if (CL_SUCCESS != wrapper_->clEnqueueNDRangeKernel(
+                          queue_, kernel, 1, NULL,
+                          (const size_t*)global_work_size,
+                          (const size_t*)local_work_size, 0, NULL, NULL)) {
+      lerror << "clEnqueueNDRangeKernel() failed";
+      return false;
+    }
+    // Copy dst into src
+    if (CL_SUCCESS != wrapper_->clEnqueueCopyBuffer(queue_, dst_, src_, 0, 0,
+                                                    size, 0, 0, NULL)) {
+      lerror << "clEnqueueCopyBuffer() failed";
+      return false;
+    }
+    count_++;
+    return true;
+  }
+
+  // Maps src, compares each element with count_, and always unmaps again.
+  bool check() {
+    size_t size = numElements_ * sizeof(cl_uint);
+    cl_event event;
+    void* ptr = wrapper_->clEnqueueMapBuffer(queue_, src_, CL_TRUE, CL_MAP_READ,
+                                             0, size, 0, NULL, NULL, NULL);
+    if (ptr == NULL) {
+      lerror << "clEnqueueMapBuffer() failed";
+      return false;
+    }
+    const cl_uint* data = static_cast<const cl_uint*>(ptr);
+    bool matches = true;
+    for (cl_uint i = 0; matches && i < numElements_; ++i) {
+      matches = (data[i] == count_);
+    }
+    wrapper_->clEnqueueUnmapMemObject(queue_, src_, ptr, 0, NULL, &event);
+    wrapper_->clWaitForEvents(1, &event);
+    wrapper_->clReleaseEvent(event);
+    return matches;
+  }
+
+  void flush() { wrapper_->clFlush(queue_); }
+
+ private:
+  OCLWrapper* wrapper_;
+  cl_context context_;
+  cl_command_queue queue_;
+  cl_uint numElements_;
+  cl_uint count_;
+  cl_mem dst_;
+  cl_mem src_;
+};
+
+MemTransfer* work[NumQueues];  // filled in by OCLMultiQueue::test()
+// NOTE(review): no definition of this free function is visible in this file.
+bool test(cl_kernel, cl_uint, cl_uint);
+
+OCLMultiQueue::OCLMultiQueue() {
+  // One sub-test per power-of-two queue count: 1, 2, ..., NumQueues.
+  failed_ = false;
+  _numSubTests = 0;
+  for (cl_uint q = 1; q <= NumQueues; q <<= 1) ++_numSubTests;
+}
+
+OCLMultiQueue::~OCLMultiQueue() = default;
+
+// Opens the base test, raises NumElements to at least maxWorkGroupSize * 32 *
+// maxComputeUnits and builds copyInc. Non-GPU devices only set failed_.
+void OCLMultiQueue::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    testDescString = "GPU device is required for this test!\n";
+    failed_ = true;
+    return;
+  }
+  size_t maxWorkGroupSize = 1;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE,
+      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_MAX_WORK_GROUP_SIZE failed");
+  cl_uint maxComputeUnits = 1;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits),
+      &maxComputeUnits, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_MAX_COMPUTE_UNITS failed");
+  const cl_uint computePower =
+      static_cast<cl_uint>(maxWorkGroupSize) * 32 * maxComputeUnits;
+  if (NumElements < computePower) NumElements = computePower;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "copyInc", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+}
+
+// Runs sub-test test_ with 2^test_ queues sharing NumRuns kernel launches.
+void OCLMultiQueue::run(void) {
+  if (failed_) return;
+
+  const cl_uint queues = 1 << test_;
+  const bool passed = test(kernel_, NumRuns / queues, queues);
+  if (passed) {
+    return;
+  }
+  lerror << "We failed a test run!";
+  CHECK_RESULT(true, lerror.str().c_str());
+}
+
+unsigned int OCLMultiQueue::close() { return OCLTestImp::close(); }
+
+// Runs numRuns kernel/copy iterations on each of numQueues queues, then
+// stores a timing summary or the failure reason in testDescString.
+bool OCLMultiQueue::test(cl_kernel kernel, cl_uint numRuns, cl_uint numQueues) {
+  cl_command_queue cmd_queue[NumQueues];
+  CPerfCounter timer;
+  cl_uint queuesMade = 0;   // entries of cmd_queue[] that must be released
+  cl_uint workersMade = 0;  // entries of work[] that are safe to delete
+  bool passed = true;
+  for (cl_uint i = 0; i < numQueues; ++i) {
+    cmd_queue[i] = _wrapper->clCreateCommandQueue(context_, devices_[_deviceId],
+                                                  0, &error_);
+    if (cmd_queue[i] == (cl_command_queue)0) {
+      // context_ was not created here, so it is not released here either.
+      testDescString = "clCreateCommandQueue() failed";
+      passed = false;
+      break;
+    }
+    ++queuesMade;
+    work[i] = new MemTransfer(_wrapper, context_, cmd_queue[i], NumElements);
+    if (!work[i]->create()) {
+      // Not deleted: the destructor could release buffers create() never set.
+      testDescString = "Test creation failed";
+      passed = false;
+      break;
+    }
+    ++workersMade;
+  }
+
+  timer.Reset();
+  timer.Start();
+  cl_uint dispatchCount = ExecutionsPerQueue / numQueues;
+  for (cl_uint i = 0; passed && i < numRuns; ++i) {
+    for (cl_uint j = 0; passed && j < numQueues; ++j) {
+      if (!work[j]->run(kernel)) {
+        testDescString = "Execution failed";
+        passed = false;
+      } else if (((i % dispatchCount) == 0) &&
+                 (((i / dispatchCount) % numQueues) == j)) {
+        // Every queue should have a dispatch after 256 executions, but the
+        // time for dispatch on each queue is shifted by dispatchCount
+        work[j]->flush();
+      }
+    }
+  }
+  for (cl_uint i = 0; passed && i < numQueues; ++i)
+    _wrapper->clFinish(cmd_queue[i]);
+  timer.Stop();
+  for (cl_uint j = 0; passed && j < numQueues; ++j) {
+    if (!work[j]->check()) {
+      testDescString = "Result Check fails!";
+      passed = false;
+    }
+  }
+  if (passed) {
+    std::stringstream stream;
+    stream << "Num Queues: " << numQueues << ", Executions Per Queue: ";
+    stream.flags(std::ios::right | std::ios::showbase);
+    stream.width(5);
+    stream << numRuns;
+    stream.precision(3);
+    stream << ", Time: " << (float)(timer.GetElapsedTime()) << " seconds";
+    testDescString = stream.str();
+  }
+  for (cl_uint i = 0; i < workersMade; ++i) delete work[i];
+  for (cl_uint i = 0; i < queuesMade; ++i)
+    _wrapper->clReleaseCommandQueue(cmd_queue[i]);
+  return passed;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h
new file mode 100644
index 0000000000..8b27b878a3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MULTI_QUEUE_H_
+#define _OCL_MULTI_QUEUE_H_
+
+#include "OCLTestImp.h"
+
+// Test case derived from OCLTestImp; method bodies live in OCLMultiQueue.cpp.
+class OCLMultiQueue : public OCLTestImp {
+ public:
+  OCLMultiQueue();
+  virtual ~OCLMultiQueue();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+ private:
+  bool test(cl_kernel kernel, cl_uint numRuns, cl_uint numQueues);
+  bool failed_;        // set by open() when the sub-test must be skipped
+  unsigned int test_;  // sub-test index passed to open()
+};
+
+#endif  // _OCL_MULTI_QUEUE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp
new file mode 100644
index 0000000000..44317a3610
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp
@@ -0,0 +1,206 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLOfflineCompilation.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "cl_kernel_info_amd.h"
+
+typedef CL_API_ENTRY cl_int(CL_API_CALL* clGetKernelInfoAMD_fn)(
+    cl_kernel kernel, cl_device_id device, cl_kernel_info_amd param_name,
+    size_t param_value_size, void* param_value, size_t* param_value_size_ret);
+// Looked up by name in OCLOfflineCompilation::open(), which checks for NULL.
+clGetKernelInfoAMD_fn clGetKernelInfoAMDp;
+
+#define BLIT_KERNEL(...) #__VA_ARGS__  // stringifies its arguments
+// Kernel source that open() builds with empty build options.
+const char* strKernel12 = BLIT_KERNEL(
+\n const constant uint test = 1; __kernel void factorial(__global uint* out) {
+  uint id = get_global_id(0);
+  uint factorial = 1;
+  out[id] = factorial + test;
+}
+\n);
+// Kernel source that open() builds with -cl-std=CL2.0.
+const char* strKernel20 = BLIT_KERNEL(
+\n const constant uint test = 1; global uint test2 = 0;
+    __kernel void factorial(__global uint* out) {
+      uint id = get_global_id(0);
+      uint factorial = 1;
+      out[id] = factorial + test;
+      if (id == 0) {
+        out[id] += test2++;
+      }
+    }
+\n);
+
+OCLOfflineCompilation::OCLOfflineCompilation() { _numSubTests = 1; }  // one sub-test
+
+OCLOfflineCompilation::~OCLOfflineCompilation() = default;
+
+// Builds both test kernels for every device of a GPU context created with
+// CL_CONTEXT_OFFLINE_DEVICES_AMD (which replaces the base-class context_) and
+// checks that each build yields a kernel reporting a non-zero VGPR count and
+// a program whose binary can be queried.
+void OCLOfflineCompilation::open(unsigned int test, char* units,
+                                 double& conversion, unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  // Resolve the extension entry point before releasing context_, so the early
+  // return below does not leave context_ pointing at a released context.
+  clGetKernelInfoAMDp =
+      (clGetKernelInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clGetKernelInfoAMD");
+  if (clGetKernelInfoAMDp == NULL) {
+    testDescString = "clGetKernelInfoAMD not found!\n";
+    return;
+  }
+
+  _wrapper->clReleaseContext(context_);
+
+  // Utilize the CL_CONTEXT_OFFLINE_DEVICES_AMD platform option to allow for
+  // the generation of binary kernel without target device installed in build
+  // system.
+  cl_context_properties cprops[5];
+  cprops[0] = CL_CONTEXT_PLATFORM;
+  cprops[1] = (cl_context_properties)platform_;
+  cprops[2] = CL_CONTEXT_OFFLINE_DEVICES_AMD;
+  cprops[3] = (cl_context_properties)1;
+  cprops[4] = (cl_context_properties)0;  // end of options list marker
+
+  // Create a context with all of the available devices.
+  context_ = _wrapper->clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, NULL,
+                                               NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContextFromType()  failed");
+
+  cl_uint numDevices = 0;  // CL_CONTEXT_NUM_DEVICES is reported as a cl_uint
+  error_ = _wrapper->clGetContextInfo(context_, CL_CONTEXT_NUM_DEVICES,
+                                      sizeof(numDevices), &numDevices, NULL);
+  CHECK_RESULT(((error_ != CL_SUCCESS) || (numDevices == 0)),
+               "clGetContextInfo()  failed");
+  const size_t deviceListSize = numDevices;
+
+  // Owns the zero-filled device list so that no early exit below leaks it.
+  struct DeviceList {
+    cl_device_id* ids;
+    explicit DeviceList(size_t count)
+        : ids((cl_device_id*)calloc(count, sizeof(cl_device_id))) {}
+    ~DeviceList() { free(ids); }
+  } deviceList(deviceListSize);
+  cl_device_id* devices = deviceList.ids;
+  CHECK_RESULT((devices == NULL), "device list allocation failed");
+
+  error_ = _wrapper->clGetContextInfo(context_, CL_CONTEXT_DEVICES,
+                                      sizeof(cl_device_id) * deviceListSize,
+                                      devices, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetContextInfo()  failed");
+
+  for (unsigned version = 1; version <= 2; ++version) {
+    std::string options;
+    const char* strKernel;
+
+    switch (version) {
+      case 1:
+        options = "";
+        strKernel = strKernel12;
+        break;
+      case 2:
+        options = "-cl-std=CL2.0";
+        strKernel = strKernel20;
+        break;
+      default:
+        assert(false);
+        return;
+    }
+
+    program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel,
+                                                   NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+    for (unsigned int i = 0; i < deviceListSize; ++i) {
+      char name[128] = {0};
+      char strVersion[128] = {0};
+      _wrapper->clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name), name,
+                                NULL);
+      error_ = _wrapper->clGetDeviceInfo(devices[i], CL_DEVICE_VERSION,
+                                         sizeof(strVersion), strVersion, 0);
+      CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+      // CL_DEVICE_VERSION reads "OpenCL <major>.<minor> ...": [7] is <major>.
+      if (version == 2 && strVersion[7] < '2') {
+        continue;
+      }
+
+      printf("Building on %s, OpenCL version %s, (options '%s')\n", name,
+             (version == 2 ? "2.0" : "1.2"), options.c_str());
+      error_ = _wrapper->clBuildProgram(program_, 1, &devices[i],
+                                        options.c_str(), NULL, NULL);
+      if (error_ != CL_SUCCESS) {
+        char programLog[1024] = {0};
+        _wrapper->clGetProgramBuildInfo(
+            program_, devices[i], CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+        printf("\n%s\n", programLog);
+        fflush(stdout);
+        break;
+      }
+      kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+      size_t usedVGPRs = 0;
+      error_ =
+          clGetKernelInfoAMDp(kernel_, devices[i], CL_KERNELINFO_USED_VGPRS,
+                              sizeof(usedVGPRs), &usedVGPRs, NULL);
+      CHECK_RESULT(((error_ != CL_SUCCESS) || (usedVGPRs == 0)),
+                   "clGetKernelInfoAMD() failed");
+
+      _wrapper->clReleaseKernel(kernel_);
+      kernel_ = nullptr;
+
+      size_t binSize;
+      error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                          sizeof(size_t), &binSize, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo() failed");
+      char* binary = new char[binSize];
+      error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
+                                          sizeof(char*), &binary, NULL);
+      delete[] binary;  // only the query result matters, so free it first
+      CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo() failed");
+    }
+    if (version == 1) {
+      error_ = _wrapper->clReleaseProgram(program_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clReleaseProgram() failed");
+    }
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // deliberate no-op
+
+void OCLOfflineCompilation::run(void) {}  // all checks happen in open()
+
+unsigned int OCLOfflineCompilation::close() { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h
new file mode 100644
index 0000000000..ec8c438309
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_OFFLINE_COMPILATION_H_
+#define _OCL_OFFLINE_COMPILATION_H_
+
+#include "OCLTestImp.h"
+
+// Test case derived from OCLTestImp.  It declares the three phase hooks
+// (open / run / close) and adds no data members of its own.
+class OCLOfflineCompilation : public OCLTestImp {
+ public:
+  OCLOfflineCompilation();
+  virtual ~OCLOfflineCompilation();
+
+ public:
+  // Set-up phase for sub-test `test` on device `deviceID`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Run phase.
+  virtual void run(void);
+  // Tear-down phase; returns the status produced by the implementation.
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_OFFLINE_COMPILATION_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp
new file mode 100644
index 0000000000..1cc9127b98
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp
@@ -0,0 +1,286 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLP2PBuffer.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <cstdio>
+#include <fstream>
+#include <sstream>
+
+#include "CL/cl.h"
+
+// Sizing tables for the P2P copy test.
+//   ChunkSize      - elements (cl_uint) per chunk
+//   NumSizes       - number of distinct buffer sizes
+//   NumRuns        - number of passes over the size table
+//   MaxSubTests    - total sub-test count (NumRuns * NumSizes)
+//   NumChunksArray - chunks per buffer, indexed by (test % NumSizes)
+//   NumIterArray   - timed iterations, indexed by (test % NumSizes)
+static const size_t ChunkSize = 256 * 1024;
+static const int NumSizes = 5;
+static const int NumRuns = 4;
+static const size_t MaxSubTests = NumRuns * NumSizes;
+static const int NumChunksArray[NumSizes] = {1, 4, 16, 32, 64};
+static const int NumIterArray[NumSizes] = {20, 15, 10, 10, 10};
+
+// Publishes the sub-test count (zero when the headers predate OpenCL 2.0) and
+// puts every data member into a defined state, so that run() and close() never
+// read an indeterminate value when open() stops early.
+OCLP2PBuffer::OCLP2PBuffer() {
+#ifdef CL_VERSION_2_0
+  _numSubTests = MaxSubTests;
+  p2p_copy_ = nullptr;
+#else
+  _numSubTests = 0;
+#endif
+  failed_ = false;
+  testID_ = 0;
+  maxSize_ = 0;
+  BufferSize = 0;
+  NumChunks = 0;
+  NumIter = 0;
+  NumStages = 0;
+  context0_ = nullptr;
+  context1_ = nullptr;
+  cmdQueue0_ = nullptr;
+  cmdQueue1_ = nullptr;
+  num_p2p_0_ = 0;
+  num_p2p_1_ = 0;
+}
+
+// Empty destructor: the CL objects held by this class are released in close().
+OCLP2PBuffer::~OCLP2PBuffer() {}
+
+// Prepares one P2P copy sub-test.
+//
+// Needs at least two devices that both report cl_amd_copy_buffer_p2p.  On
+// success the object owns one context and one command queue per device, and
+// buffers_ holds two device buffers (index 0 in context0_, index 1 in
+// context1_) sized for this sub-test.  An unmet prerequisite sets failed_, which
+// turns run() into a no-op; API errors are reported through CHECK_RESULT.
+//
+// test                        - sub-test index; (test % NumSizes) selects the
+//                               buffer size and the iteration count
+// units, conversion, deviceId - forwarded unchanged to OCLTestImp::open()
+void OCLP2PBuffer::open(unsigned int test, char* units, double& conversion,
+                        unsigned int deviceId) {
+#ifdef CL_VERSION_2_0
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  if (deviceCount_ < 2) {
+    printf("\nTwo GPUs are required to run P2P test\n");
+    failed_ = true;
+    return;
+  }
+
+  testID_ = test;
+
+  // Both devices must advertise the extension.  The string is sized by the
+  // runtime and fetched fresh per device, so a long extension list or a failed
+  // query can never be mistaken for a match left over from the other device.
+  for (int dev = 0; dev < 2; ++dev) {
+    size_t extSize = 0;
+    cl_int status = _wrapper->clGetDeviceInfo(
+        devices_[dev], CL_DEVICE_EXTENSIONS, 0, nullptr, &extSize);
+    std::string extensions(extSize, '\0');
+    if ((status == CL_SUCCESS) && (extSize != 0)) {
+      status = _wrapper->clGetDeviceInfo(devices_[dev], CL_DEVICE_EXTENSIONS,
+                                         extSize, &extensions[0], nullptr);
+    }
+    if ((status != CL_SUCCESS) ||
+        (extensions.find("cl_amd_copy_buffer_p2p") == std::string::npos)) {
+      printf("P2P extension is required for this test!\n");
+      failed_ = true;
+      return;
+    }
+  }
+
+  // Resolve the extension entry point before any CL object is created, so
+  // that this failure path leaves nothing behind to release.
+  p2p_copy_ =
+      (clEnqueueCopyBufferP2PAMD_fn)clGetExtensionFunctionAddressForPlatform(
+          platform_, "clEnqueueCopyBufferP2PAMD");
+  if (p2p_copy_ == NULL) {
+    testDescString = "Failed to initialize P2P extension!\n";
+    failed_ = true;
+    return;
+  }
+
+  // Record how many P2P peers each device reports (run() uses the counts for
+  // the "<P2P>" tag) and exercise the peer-list query as well.
+  cl_uint* peerCounts[2] = {&num_p2p_0_, &num_p2p_1_};
+  for (int dev = 0; dev < 2; ++dev) {
+    cl_uint& count = *peerCounts[dev];
+    count = 0;
+    _wrapper->clGetDeviceInfo(devices_[dev], CL_DEVICE_NUM_P2P_DEVICES_AMD,
+                              sizeof(count), &count, nullptr);
+    if (count != 0) {
+      cl_device_id* peers = new cl_device_id[count];
+      _wrapper->clGetDeviceInfo(devices_[dev], CL_DEVICE_P2P_DEVICES_AMD,
+                                sizeof(cl_device_id) * count, peers, nullptr);
+      delete[] peers;
+    }
+  }
+
+  // One context per device, both on the platform the devices belong to.
+  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)platform_, 0};
+  context0_ =
+      _wrapper->clCreateContext(props, 1, &devices_[0], NULL, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext#0 failed");
+
+  context1_ =
+      _wrapper->clCreateContext(props, 1, &devices_[1], NULL, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext#1 failed");
+
+  NumChunks = NumChunksArray[testID_ % NumSizes];
+  NumIter = NumIterArray[testID_ % NumSizes];
+  BufferSize = NumChunks * ChunkSize * sizeof(cl_uint);
+
+  cl_queue_properties prop[] = {CL_QUEUE_PROPERTIES, 0, 0};
+  cmdQueue0_ = _wrapper->clCreateCommandQueueWithProperties(
+      context0_, devices_[0], prop, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+  cmdQueue1_ = _wrapper->clCreateCommandQueueWithProperties(
+      context1_, devices_[1], prop, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+
+  // buffers_[0] lives in context0_, buffers_[1] in context1_.
+  cl_mem buf = _wrapper->clCreateBuffer(context0_, CL_MEM_READ_ONLY,
+                                        BufferSize, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buf);
+
+  buf = _wrapper->clCreateBuffer(context1_, 0, BufferSize, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buf);
+#endif
+}
+
+// Runs one P2P copy sub-test and publishes its bandwidth.
+//
+// Steps: fill both device buffers with distinct byte patterns, do one untimed
+// P2P copy that waits on the write event, time NumIter further copies (the
+// best time is kept), read the destination back and compare every element
+// with the source pattern.  The result goes to _perfInfo (GB/s) and
+// testDescString.  Does nothing when open() flagged failed_.
+void OCLP2PBuffer::run(void) {
+#ifdef CL_VERSION_2_0
+  if (failed_) {
+    return;
+  }
+  // Direction of the copy: 0 = buffers_[0] -> buffers_[1], 1 = the reverse.
+  cl_uint subTest = (testID_ / NumSizes) % 2;
+
+  // Owns the host staging arrays so they are freed on every way out of this
+  // function, including an early return taken from inside CHECK_RESULT.
+  struct HostArrays {
+    cl_uint* first;
+    cl_uint* second;
+    explicit HostArrays(size_t count)
+        : first(new cl_uint[count]), second(new cl_uint[count]) {}
+    ~HostArrays() {
+      delete[] first;
+      delete[] second;
+    }
+  } host(NumChunks * ChunkSize);
+  cl_uint* buffer = host.first;
+  cl_uint* buffer2 = host.second;
+  cl_event event = nullptr;
+
+  // The event is requested only from the write that fills the copy's
+  // destination buffer; the first P2P copy below waits on it.
+  memset(buffer, 0x23, BufferSize);
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueue1_, buffers_[1], CL_TRUE, 0,
+                                          BufferSize, buffer, 0, nullptr,
+                                          (subTest == 0) ? &event : nullptr);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  memset(buffer2, 0xEB, BufferSize);
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueue0_, buffers_[0], CL_TRUE, 0,
+                                          BufferSize, buffer2, 0, nullptr,
+                                          (subTest == 1) ? &event : nullptr);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  CPerfCounter timer;
+
+  // Untimed warm-up copy on the source device's queue.
+  double sec = 0.;
+  if (subTest == 0) {
+    error_ = p2p_copy_(cmdQueue0_, buffers_[0], buffers_[1], 0, 0, BufferSize,
+                       1, &event, nullptr);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueCopyBufferP2PAMD() failed");
+    _wrapper->clFinish(cmdQueue0_);
+  } else {
+    error_ = p2p_copy_(cmdQueue1_, buffers_[1], buffers_[0], 0, 0, BufferSize,
+                       1, &event, nullptr);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueCopyBufferP2PAMD() failed");
+    _wrapper->clFinish(cmdQueue1_);
+  }
+  clReleaseEvent(event);
+
+  // Sub-test groups 0 and 3 time the copy on queue 0, groups 1 and 2 on
+  // queue 1.
+  cl_command_queue execQueue;
+  if (((testID_ / NumSizes) == 0) || ((testID_ / NumSizes) == 3)) {
+    execQueue = cmdQueue0_;
+  } else {
+    execQueue = cmdQueue1_;
+  }
+
+  for (int i = 0; i < NumIter; ++i) {
+    timer.Reset();
+    timer.Start();
+
+    if (subTest == 0) {
+      error_ = p2p_copy_(execQueue, buffers_[0], buffers_[1], 0, 0, BufferSize,
+                         0, nullptr, nullptr);
+    } else {
+      error_ = p2p_copy_(execQueue, buffers_[1], buffers_[0], 0, 0, BufferSize,
+                         0, nullptr, nullptr);
+    }
+    _wrapper->clFinish(execQueue);
+    timer.Stop();
+    // A failed enqueue would yield a meaningless (too fast) timing.
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueCopyBufferP2PAMD() failed");
+
+    // Keep the fastest iteration.
+    double cur = timer.GetElapsedTime();
+    if ((i == 0) || (cur < sec)) {
+      sec = cur;
+    }
+  }
+
+  // Read the destination back over a scratch pattern and validate it.
+  memset(buffer, 0x20, BufferSize);
+  if (subTest == 0) {
+    error_ = _wrapper->clEnqueueReadBuffer(cmdQueue1_, buffers_[1], CL_TRUE, 0,
+                                           BufferSize, buffer, 0, NULL, NULL);
+  } else {
+    error_ = _wrapper->clEnqueueReadBuffer(cmdQueue0_, buffers_[0], CL_TRUE, 0,
+                                           BufferSize, buffer, 0, NULL, NULL);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed!");
+
+  cl_uint cmp_value = (subTest == 0) ? 0xEBEBEBEB : 0x23232323;
+  for (int c = 0; c < NumChunks; ++c) {
+    for (cl_uint i = 0; i < ChunkSize; ++i) {
+      if (buffer[c * ChunkSize + i] != cmp_value) {
+        CHECK_RESULT(true, "Validation failed!");
+      }
+    }
+  }
+
+  cl_uint* p2p = ((subTest == 0) ? &num_p2p_0_ : &num_p2p_1_);
+  _perfInfo = (float)BufferSize / ((float)sec * 1000.f * 1000.f * 1000.f);
+  std::stringstream str;
+  if ((testID_ / (2 * NumSizes)) == 0) {
+    str << "Write dev" << ((subTest == 0) ? 0 : 1) << "->dev"
+        << ((subTest == 0) ? 1 : 0) << ((*p2p != 0) ? " <P2P> " : " ") << "(";
+  } else {
+    str << "Read  dev" << ((subTest == 0) ? 1 : 0) << "<-dev"
+        << ((subTest == 0) ? 0 : 1) << ((*p2p != 0) ? " <P2P> " : " ") << "(";
+  }
+  str.width(2);
+  str << BufferSize / (1000 * 1000);
+  str << " MB "
+      << ") transfer speed (GB/s):";
+  testDescString = str.str();
+#endif
+}
+
+// Releases the per-device queues and contexts created by open(), then lets
+// the base class finish the tear-down and returns its status.
+//
+// Release no longer depends on failed_: open() can set that flag after some
+// objects already exist, and each handle is null-checked anyway.  Handles are
+// reset afterwards so that a repeated close() cannot release them twice.
+unsigned int OCLP2PBuffer::close(void) {
+#ifdef CL_VERSION_2_0
+  if (cmdQueue0_ != nullptr) {
+    _wrapper->clReleaseCommandQueue(cmdQueue0_);
+    cmdQueue0_ = nullptr;
+  }
+  if (cmdQueue1_ != nullptr) {
+    _wrapper->clReleaseCommandQueue(cmdQueue1_);
+    cmdQueue1_ = nullptr;
+  }
+  if (context0_ != nullptr) {
+    _wrapper->clReleaseContext(context0_);
+    context0_ = nullptr;
+  }
+  if (context1_ != nullptr) {
+    _wrapper->clReleaseContext(context1_);
+    context1_ = nullptr;
+  }
+  return OCLTestImp::close();
+#else
+  return CL_SUCCESS;
+#endif
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h
new file mode 100644
index 0000000000..b6fc61a478
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_P2P_BUFFER_H_
+#define _OCL_P2P_BUFFER_H_
+
+#include "OCLTestImp.h"
+
+// Bandwidth/correctness test for peer-to-peer buffer copies between two
+// devices.  Each device gets its own context and command queue.
+class OCLP2PBuffer : public OCLTestImp {
+ public:
+  OCLP2PBuffer();
+  virtual ~OCLP2PBuffer();
+
+ public:
+  // Set-up phase for sub-test `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Run phase: performs, times and validates the copies.
+  virtual void run(void);
+  // Tear-down phase; returns the resulting status.
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;                 // set by open() when a prerequisite is missing
+  unsigned int testID_;         // sub-test index passed to open()
+  cl_ulong maxSize_;            // only zeroed by the constructor
+  size_t BufferSize;            // size in bytes of each device buffer
+  int NumChunks;                // chunks per buffer for this sub-test
+  int NumIter;                  // timed iterations for this sub-test
+  int NumStages;                // not referenced by the implementation file
+  cl_context context0_;         // context on devices_[0]
+  cl_context context1_;         // context on devices_[1]
+  cl_command_queue cmdQueue0_;  // queue on devices_[0]
+  cl_command_queue cmdQueue1_;  // queue on devices_[1]
+  cl_uint num_p2p_0_;           // P2P peer count reported by devices_[0]
+  cl_uint num_p2p_1_;           // P2P peer count reported by devices_[1]
+#ifdef CL_VERSION_2_0
+  clEnqueueCopyBufferP2PAMD_fn p2p_copy_;  // extension entry point
+#endif
+};
+
+#endif  // _OCL_P2P_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp
new file mode 100644
index 0000000000..1e897bafe1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp
@@ -0,0 +1,292 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPartialWrkgrp.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Number of int4 elements in the destination buffer.
+static const size_t BufSize = 0x1000;
+
+// OpenCL C source with three kernels.  Each work-item writes its global id(s)
+// into the int4 at its linearised global index: fillX writes .x (1-D), fillXY
+// writes .x/.y (2-D) and fillXYZ writes .x/.y/.z (3-D).
+const static char* strKernel =
+    "__kernel void fillX(__global int4* out)                                \n"
+    "{                                                                      \n"
+    "   int id = get_global_id(0);                                          \n"
+    "   out[id].x = id;                                                     \n"
+    "}                                                                      \n"
+    "                                                                       \n"
+    "__kernel void fillXY(__global int4* out)                               \n"
+    "{                                                                      \n"
+    "   int id = get_global_id(0) + get_global_id(1) * get_global_size(0);  \n"
+    "   out[id].x = get_global_id(0);                                       \n"
+    "   out[id].y = get_global_id(1);                                       \n"
+    "}                                                                      \n"
+    "                                                                       \n"
+    "__kernel void fillXYZ(__global int4* out)                              \n"
+    "{                                                                      \n"
+    "   int id = get_global_id(0) + get_global_id(1) * get_global_size(0) + \n"
+    "       get_global_id(2) * get_global_size(0) * get_global_size(1);     \n"
+    "   out[id].x = get_global_id(0);                                       \n"
+    "   out[id].y = get_global_id(1);                                       \n"
+    "   out[id].z = get_global_id(2);                                       \n"
+    "}                                                                      \n";
+
+// Two sub-tests are exposed; the device is assumed capable of the second one
+// until open() finds otherwise.
+OCLPartialWrkgrp::OCLPartialWrkgrp() {
+  isOCL2_ = true;
+  _numSubTests = 2;
+}
+
+// Empty destructor.
+OCLPartialWrkgrp::~OCLPartialWrkgrp() {}
+
+// Prepares one partial-work-group sub-test.
+//
+// Sub-test 0 builds the kernels with default options.  Sub-test 1 builds them
+// with "-cl-uniform-work-group-size -cl-std=CL2.0" and is skipped (isOCL2_
+// cleared, nothing built) when the device version string does not contain
+// "OpenCL 2.0".  On success kernel_ is "fillX" and buffers_ holds one buffer of
+// BufSize int4 elements.
+//
+// test                        - sub-test index (0 or 1)
+// units, conversion, deviceId - forwarded to OCLTestImp::open(); deviceId also
+//                               selects the device that is built for
+void OCLPartialWrkgrp::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  _openTest = test;
+  // Re-arm the flag so that a skip decided by an earlier open() on this
+  // object does not carry over to the current sub-test.
+  isOCL2_ = true;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  // Zero-filled so that strstr() sees an empty string if the query fails.
+  char version[128] = {0};
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_VERSION,
+                            sizeof(version), version, NULL);
+
+  if (_openTest == 1 && strstr(version, "OpenCL 2.0") == NULL) {
+    isOCL2_ = false;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  switch (_openTest) {
+    case 0:
+      error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                        NULL, NULL);
+      break;
+    case 1:
+      error_ = _wrapper->clBuildProgram(
+          program_, 1, &devices_[deviceId],
+          "-cl-uniform-work-group-size -cl-std=CL2.0", NULL, NULL);
+      break;
+    default:
+      // The condition must be true for CHECK_RESULT to report the failure.
+      CHECK_RESULT(true, "Invalid test number > _numSubTests");
+      return;
+  }
+
+  if (error_ != CL_SUCCESS) {
+    // Zero-filled so that a failed log query prints nothing instead of
+    // whatever happened to be on the stack.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "fillX", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                    BufSize * sizeof(cl_int4), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+// Callback with the (errinfo, private_info, cb, user_data) parameter list of
+// an OpenCL context-notification function.  The body is empty, so every
+// argument is ignored.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Launches each kernel with a global size that is not a multiple of the local
+// size, in one, two and three dimensions.
+//
+// Sub-test 0 (default build): if the enqueue is rejected the function simply
+// returns; otherwise the buffer is read back and every element covered by the
+// launch must hold its global id(s) while the elements past the range must
+// still be zero.
+// Sub-test 1 (uniform-work-group-size build): every enqueue must fail with
+// CL_INVALID_WORK_GROUP_SIZE.
+//
+// Does nothing when open() skipped the sub-test (isOCL2_ == false).
+void OCLPartialWrkgrp::run(void) {
+  if (!isOCL2_) return;
+  cl_mem buffer = buffers()[0];
+
+  // Owns the host copy of the buffer so that it is freed on every exit path
+  // and deleted through the same type it was allocated with.
+  struct HostValues {
+    cl_int4* storage;
+    HostValues() : storage(new cl_int4[BufSize]) {}
+    ~HostValues() { delete[] storage; }
+  } host;
+  // Flat view: element i occupies values[4 * i .. 4 * i + 3] (x, y, z, w).
+  unsigned int* values = reinterpret_cast<unsigned int*>(host.storage);
+
+  //
+  // Check unaligned workgroup in X dimension
+  //
+
+  // Clear destination buffer
+  memset(values, 0, BufSize * sizeof(cl_int4));
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer,
+                                          CL_TRUE, 0, BufSize * sizeof(cl_int4),
+                                          values, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t gws[1] = {BufSize - 1};
+  size_t lws[1] = {256};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+
+  switch (_openTest) {
+    case 0:
+      if (error_ != CL_SUCCESS) {
+        return;
+      }
+      error_ = _wrapper->clEnqueueReadBuffer(
+          cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4),
+          values, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+      for (size_t x = 0; x < BufSize; ++x) {
+        if (x == (BufSize - 1)) {
+          // Last element lies outside the launched range.
+          CHECK_RESULT((values[4 * x] != 0), "Comparison failed!");
+        } else {
+          CHECK_RESULT((values[4 * x] != x), "Comparison failed!");
+        }
+      }
+      break;
+    case 1:
+      CHECK_RESULT((error_ != CL_INVALID_WORK_GROUP_SIZE),
+                   "clEnqueueNDRangeKernel(): "
+                   "Expected to fail for non-uniform work group sizes!");
+      // Without this break the code fell into `default` and returned, so the
+      // 2-D and 3-D cases were never exercised for sub-test 1.
+      break;
+    default:
+      CHECK_RESULT(true, "Invalid test number > _numSubTests");
+      return;
+  }
+
+  error_ = _wrapper->clReleaseKernel(kernel_);
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed");
+
+  //
+  // Check unaligned workgroup in X and Y dimensions
+  //
+  kernel_ = _wrapper->clCreateKernel(program_, "fillXY", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // Clear destination buffer
+  memset(values, 0, BufSize * sizeof(cl_int4));
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer,
+                                          CL_TRUE, 0, BufSize * sizeof(cl_int4),
+                                          values, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t gws2[2] = {0x3f, 0x3f};
+  size_t lws2[2] = {16, 16};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                            NULL, gws2, lws2, 0, NULL, NULL);
+
+  switch (_openTest) {
+    case 0:
+      if (error_ != CL_SUCCESS) {
+        return;
+      }
+      error_ = _wrapper->clEnqueueReadBuffer(
+          cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4),
+          values, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+      for (size_t y = 0; y < 0x40; ++y) {
+        for (size_t x = 0; x < 0x3f; ++x) {
+          size_t id = x + y * 0x3f;
+          if (y == 0x3f) {
+            // Row 0x3f lies outside the launched range.
+            CHECK_RESULT((values[4 * id] != 0), "Comparison failed!");
+            CHECK_RESULT((values[4 * id + 1] != 0), "Comparison failed!");
+          } else {
+            CHECK_RESULT((values[4 * id] != x), "Comparison failed!");
+            CHECK_RESULT((values[4 * id + 1] != y), "Comparison failed!");
+          }
+        }
+      }
+      break;
+    case 1:
+      CHECK_RESULT((error_ != CL_INVALID_WORK_GROUP_SIZE),
+                   "clEnqueueNDRangeKernel(): "
+                   "Expected to fail for non-uniform work group sizes!");
+      break;
+    default:
+      CHECK_RESULT(true, "Invalid test number > _numSubTests");
+      return;
+  }
+
+  error_ = _wrapper->clReleaseKernel(kernel_);
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed");
+
+  //
+  // Check unaligned workgroup in X, Y and Z dimensions
+  //
+  kernel_ = _wrapper->clCreateKernel(program_, "fillXYZ", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  // Clear destination buffer
+  memset(values, 0, BufSize * sizeof(cl_int4));
+  error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer,
+                                          CL_TRUE, 0, BufSize * sizeof(cl_int4),
+                                          values, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t gws3[3] = {0xf, 0x10, 0xf};
+  size_t lws3[3] = {4, 4, 4};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 3,
+                                            NULL, gws3, lws3, 0, NULL, NULL);
+  switch (_openTest) {
+    case 0:
+      if (error_ != CL_SUCCESS) {
+        return;
+      }
+      error_ = _wrapper->clEnqueueReadBuffer(
+          cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4),
+          values, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+      for (size_t z = 0; z < 0x10; ++z) {
+        for (size_t y = 0; y < 0x10; ++y) {
+          for (size_t x = 0; x < 0xf; ++x) {
+            size_t id = x + y * 0xf + z * 0xf0;
+            if (z == 0xf) {
+              // Slice 0xf lies outside the launched range.
+              CHECK_RESULT((values[4 * id] != 0), "Comparison failed!");
+              CHECK_RESULT((values[4 * id + 1] != 0), "Comparison failed!");
+              CHECK_RESULT((values[4 * id + 2] != 0), "Comparison failed!");
+            } else {
+              CHECK_RESULT((values[4 * id] != x), "Comparison failed!");
+              CHECK_RESULT((values[4 * id + 1] != y), "Comparison failed!");
+              CHECK_RESULT((values[4 * id + 2] != z), "Comparison failed!");
+            }
+          }
+        }
+      }
+      break;
+    case 1:
+      CHECK_RESULT((error_ != CL_INVALID_WORK_GROUP_SIZE),
+                   "clEnqueueNDRangeKernel(): "
+                   "Expected fail for non-uniform work group sizes!");
+      break;
+    default:
+      CHECK_RESULT(true, "Invalid test number > _numSubTests");
+      return;
+  }
+}
+
+// Tear-down has nothing class-specific to do; the base class result is
+// returned unchanged.
+unsigned int OCLPartialWrkgrp::close(void) {
+  const unsigned int baseStatus = OCLTestImp::close();
+  return baseStatus;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h
new file mode 100644
index 0000000000..20666e157f
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PARTIAL_WRKGRP_H_
+#define _OCL_PARTIAL_WRKGRP_H_
+
+#include "OCLTestImp.h"
+
+// Test for NDRange launches whose global size is not a multiple of the
+// work-group size, in one, two and three dimensions.
+class OCLPartialWrkgrp : public OCLTestImp {
+ public:
+  OCLPartialWrkgrp();
+  virtual ~OCLPartialWrkgrp();
+
+ public:
+  // Set-up phase for sub-test `test` on device `deviceID`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Run phase: launches the kernels and checks the outcome.
+  virtual void run(void);
+  // Tear-down phase; returns the resulting status.
+  virtual unsigned int close(void);
+
+ private:
+  // Cleared by open() when sub-test 1 is requested on a device whose version
+  // string lacks "OpenCL 2.0"; run() then returns immediately.
+  bool isOCL2_;
+};
+
+#endif  // _OCL_PARTIAL_WRKGRP_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp
new file mode 100644
index 0000000000..dd434701b5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp
@@ -0,0 +1,798 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfCounters.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+struct PerfCounterInfo {
+  cl_long blockIdx;    //!< Value paired with CL_PERFCOUNTER_GPU_BLOCK_INDEX
+  cl_long counterIdx;  //!< Value paired with CL_PERFCOUNTER_GPU_COUNTER_INDEX
+  cl_long eventIdx;    //!< Value paired with CL_PERFCOUNTER_GPU_EVENT_INDEX
+};
+
+struct DeviceCounterInfo {
+  const char *deviceName_;          //!< Name compared with CL_DEVICE_NAME
+  unsigned int devId_;              //!< Device id; run() tests it against 9
+  PerfCounterInfo perfCounter_[2];  //!< Performance counters, one per sub-test
+};
+
+static const DeviceCounterInfo DeviceInfo[]{
+    // GFX10. Entry layout: name, devId_, {blockIdx, counterIdx, eventIdx} x 2.
+    {"gfx1000",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx1010",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx1011",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx1012",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    // GFX9. NOTE(review): GFX9/GFX10 rows use counterIdx 1 but say "reg 0".
+    {"gfx900",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx901",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx902",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx903",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx904",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx905",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx906",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx907",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    // Sea Islands, GFX8
+    {"Bonaire",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Hawaii",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Maui",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Casper",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Spectre",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Slimer",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Spooky",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Kalindi",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Mullins",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Iceland",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Tonga",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Bermuda",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Fiji",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Carrizo",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Ellesmere",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Baffin",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Stoney",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"gfx804",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"gfx803",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Bristol Ridge",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    // Southern Islands
+    {"Tahiti",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Pitcairn",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Capeverde",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Oland",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Hainan",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+};
+const int DeviceCounterSize = sizeof(DeviceInfo) / sizeof(DeviceCounterInfo);
+
+static const char *sha256_kernel =
+    "typedef uint UINT;\n"
+    "\n"
+    "#define VECTOR_LEN 1\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "\n"
+    "inline UINT byteswap(UINT x)\n"
+    "{\n"
+    "    UINT res = 0;\n"
+    "    \n"
+    "    for (uint i=0; i<4; i++)\n"
+    "    {\n"
+    "        res <<= 8;\n"
+    "        res |= (x & 0xff);\n"
+    "        x >>= 8;\n"
+    "    }\n"
+    "    \n"
+    "    return res;\n"
+    "}\n"
+    "\n"
+    "#else\n"
+    "\n"
+    "inline UINT byteswap(const UINT x)\n"
+    "{\n"
+    "    return x;\n"
+    "}\n"
+    "\n"
+    "#endif\n"
+    "\n"
+    "\n"
+    "void sha256_step( const UINT data[16], UINT *state )\n"
+    "{\n"
+    "   UINT W[64], temp1, temp2;\n"
+    "   UINT A, B, C, D, E, F, G, H;\n"
+    "\n"
+    "   for( int i = 0; i < 16; i++)\n"
+    "   {\n"
+    "      W[i] = byteswap(data[i]);\n"
+    "   }\n"
+    "\n"
+    "#define SHR(x,n)  ((x & 0xFFFFFFFF) >> n)\n"
+    "#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))\n"
+    "\n"
+    "#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))\n"
+    "#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))\n"
+    "\n"
+    "#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))\n"
+    "#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))\n"
+    "\n"
+    "#define F0(x,y,z) ((x & y) | (z & (x | y)))\n"
+    "#define F1(x,y,z) (z ^ (x & (y ^ z)))\n"
+    "\n"
+    "#define R(t)                                    \\\n"
+    "(                                               \\\n"
+    "    W[t] = S1(W[t -  2]) + W[t -  7] +          \\\n"
+    "           S0(W[t - 15]) + W[t - 16]            \\\n"
+    ")\n"
+    "\n"
+    "#define P(a,b,c,d,e,f,g,h,x,K)                  \\\n"
+    "{                                               \\\n"
+    "    temp1 = h + S3(e) + F1(e,f,g) + K + x;      \\\n"
+    "    temp2 = S2(a) + F0(a,b,c);                  \\\n"
+    "    d += temp1; h = temp1 + temp2;              \\\n"
+    "}\n"
+    "\n"
+    "    A = state[0];\n"
+    "    B = state[1];\n"
+    "    C = state[2];\n"
+    "    D = state[3];\n"
+    "    E = state[4];\n"
+    "    F = state[5];\n"
+    "    G = state[6];\n"
+    "    H = state[7];\n"
+    "\n"
+    "    P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 );\n"
+    "    P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 );\n"
+    "    P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF );\n"
+    "    P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 );\n"
+    "    P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B );\n"
+    "    P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 );\n"
+    "    P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 );\n"
+    "    P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 );\n"
+    "    P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 );\n"
+    "    P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 );\n"
+    "    P( G, H, A, B, C, D, E, F, W[10], 0x243185BE );\n"
+    "    P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 );\n"
+    "    P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 );\n"
+    "    P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE );\n"
+    "    P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 );\n"
+    "    P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(16), 0xE49B69C1 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC );\n"
+    "    P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F );\n"
+    "    P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA );\n"
+    "    P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC );\n"
+    "    P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA );\n"
+    "    P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D );\n"
+    "    P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(27), 0xBF597FC7 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 );\n"
+    "    P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(31), 0x14292967 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC );\n"
+    "    P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB );\n"
+    "    P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E );\n"
+    "    P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B );\n"
+    "    P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 );\n"
+    "    P( C, D, E, F, G, H, A, B, R(46), 0xF40E3585 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 );\n"
+    "    P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 );\n"
+    "    P( G, H, A, B, C, D, E, F, R(50), 0x2748774C );\n"
+    "    P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 );\n"
+    "    P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A );\n"
+    "    P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F );\n"
+    "    P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 );\n"
+    "    P( A, B, C, D, E, F, G, H, R(56), 0x748F82EE );\n"
+    "    P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F );\n"
+    "    P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 );\n"
+    "    P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 );\n"
+    "    P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA );\n"
+    "    P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB );\n"
+    "    P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 );\n"
+    "    P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 );\n"
+    "\n"
+    "    state[0] += A;\n"
+    "    state[1] += B;\n"
+    "    state[2] += C;\n"
+    "    state[3] += D;\n"
+    "    state[4] += E;\n"
+    "    state[5] += F;\n"
+    "    state[6] += G;\n"
+    "    state[7] += H;\n"
+    "}\n"
+    "\n"
+    "\n"
+    "#define choose_temp(x) ((x)/16)\n"
+    "\n"
+    "#define STORE_TO_TEMP(i) tb[((i)/16)][((i)%16)]\n"
+    "\n"
+    "\n"
+    "__kernel void CryptThread(__global const uint *buffer, __global uint "
+    "*state, const uint blockLen, const uint foo)\n"
+    "{\n"
+    "    const uint init[8] = {\n"
+    "        0x6a09e667,\n"
+    "        0xbb67ae85,\n"
+    "        0x3c6ef372,\n"
+    "        0xa54ff53a,\n"
+    "        0x510e527f,\n"
+    "        0x9b05688c,\n"
+    "        0x1f83d9ab,\n"
+    "        0x5be0cd19\n"
+    "    };\n"
+    "    \n"
+    "    const uint id = get_global_id(0);\n"
+    "    uint len = blockLen;\n"
+    "    uint i, j;\n"
+    "    const uint startPosInDWORDs = (len*id*foo)/4;\n"
+    "    const uint msgLenInBitsl = len * 8;\n"
+    "    const uint msgLenInBitsh = (len) >> (32-3);\n"
+    "    UINT localState[8];\n"
+    "\n"
+    "    for (j=0; j<8; j++) {\n"
+    "        localState[j] = init[j];\n"
+    "    }\n"
+    "\n"
+    "    i = 0;\n"
+    "    while (len >=64)\n"
+    "    {\n"
+    "        UINT data[16];\n"
+    "        for (j=0; j<16; j++) {\n"
+    "            data[j] = buffer[j + startPosInDWORDs + i];\n"
+    "        }\n"
+    "\n"
+    "        sha256_step(data, localState);\n"
+    "        i += 16;\n"
+    "        len -= 64;\n"
+    "    }\n"
+    "\n"
+    "    len /= 4;\n"
+    "\n"
+    "    UINT tb[2][16];\n"
+    "\n"
+    "    for (j=0; j<len; j++) \n"
+    "    {\n"
+    "        STORE_TO_TEMP(j) = buffer[j + startPosInDWORDs + i];\n"
+    "    }\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "    STORE_TO_TEMP(len) = 0x80;\n"
+    "#else\n"
+    "    STORE_TO_TEMP(len) = byteswap(0x80000000);\n"
+    "#endif\n"
+    "\n"
+    "    i = len+1;\n"
+    "\n"
+    "    while ((i % (512/32)) != (448/32))\n"
+    "    {\n"
+    "        STORE_TO_TEMP(i) = 0;\n"
+    "        i++;\n"
+    "    }\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "    {\n"
+    "        STORE_TO_TEMP(i) = byteswap(msgLenInBitsh);\n"
+    "        STORE_TO_TEMP(i + 1) = byteswap(msgLenInBitsl);\n"
+    "        i += 2;\n"
+    "    }\n"
+    "\n"
+    "#else\n"
+    "#endif\n"
+    "    \n"
+    "    sha256_step(tb[0], localState);\n"
+    "    if (32 == i)\n"
+    "    {\n"
+    "        sha256_step(tb[1], localState);\n"
+    "    }\n"
+    "    \n"
+    "    for (j=0; j<8; j++)\n"
+    "    {\n"
+    "        state[id*8 + j] = localState[j];\n"
+    "    }\n"
+    "}\n";
+
+#define NUM_COUNTERS 2
+
+cl_device_id global_device;
+
+OCLPerfCounters::OCLPerfCounters() { _numSubTests = NUM_COUNTERS; }  // one sub-test per perfCounter_ slot
+
+OCLPerfCounters::~OCLPerfCounters() {}  // nothing to do here; close() releases the OpenCL objects
+
+// Fills all width_ elements of |buffer| with |val| via a blocking map/unmap.
+bool OCLPerfCounters::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *mapped = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  if (error_ != CL_SUCCESS) {
+    printf("\nError code : %d\n", error_);
+    return false;
+  }
+
+  // Returns true only when both the map and the unmap succeed.
+  for (unsigned int idx = 0; idx < width_; idx++) mapped[idx] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, mapped, 0,
+                                             NULL, NULL);
+  return error_ == CL_SUCCESS;
+}
+
+// Maps |buffer| for reading and unmaps it again; no contents are verified yet.
+void OCLPerfCounters::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // Skip the unmap when mapping failed so error_ keeps the map failure code.
+  if (error_ != CL_SUCCESS || data == NULL) return;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // passed to clCreateContext; ignores every report
+
+// Sets up sub-test |test| on GPU |deviceId|: selects the platform and device,
+// then creates the context, queue, buffers and the "CryptThread" kernel.
+// |conversion| is set to 1.0 and |units| is not touched. Temporary id lists
+// are freed before each CHECK_RESULT, which is assumed to return on failure
+// (compare CHECK_RESULT_NO_RETURN in close()).
+void OCLPerfCounters::open(unsigned int test, char *units, double &conversion,
+                           unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  num_input_buf_ = 1;
+  num_output_buf_ = 1;
+  blockSize_ = 1024;
+  isAMD = false;
+
+  if (type_ != CL_DEVICE_TYPE_GPU) {
+    char msg[256];
+    SNPRINTF(msg, sizeof(msg), "No GPU devices present. Exiting!\t");
+    testDescString = msg;
+    return;
+  }
+
+  width_ = 22347776;
+  // The buffers are 1-D: width_ elements of cl_uint each.
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Copy the one handle needed and free the list before any early return.
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    CHECK_RESULT(platform == 0, "Requested platform index not available");
+    char pbuf[100] = "";
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so error_ is deliberately not checked here.
+    if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  // Keep only the selected id; free the list before a check can return.
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  global_device = device;
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Value-initialise so close() never sees indeterminate handles.
+  inBuffer_ = new cl_mem[4]();
+  outBuffer_ = new cl_mem[4]();
+
+  for (int i = 0; i < num_input_buf_; ++i) {
+    inBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(inBuffer_[i] == 0, "clCreateBuffer(inBuffer) failed");
+    bool result = setData(inBuffer_[i], 0xdeadbeef);
+    CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed");
+  }
+
+  for (int i = 0; i < num_output_buf_; ++i) {
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+    bool result = setData(outBuffer_[i], 0xdeadbeef);
+    CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed");
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, (const char **)&sha256_kernel, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  const char *buildOps = NULL;
+  if (isAMD) {
+    // Enable caching
+    buildOps = "-fno-alias";
+  }
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384] = "";
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // Always true in this branch; a literal 0 here would never fail the test.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void *)&inBuffer_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem),
+                                    (void *)&outBuffer_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&blockSize_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(blockLen) failed");
+  // Foo is not part of the original test, this can be used to see how much of
+  // the performance is limited by fetch. Set foo to 0 and all threads will
+  // fetch the same 1k block.  This way they will all be in cache and hit max
+  // fetch speed.
+  unsigned int foo = 1;
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), (void *)&foo);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(foo) failed");
+}
+
+// Samples the AMD perf counter selected by _openTest around the kernel launch
+// and writes the reading to testDescString; a zero reading fails the test.
+void OCLPerfCounters::run(void) {
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return;
+
+  // 32 gives the best result due to memory thrashing.  Need to optimize and
+  // give feedback to SiSoft.
+  size_t global_work_size[1] = {bufSize_ / blockSize_};
+  size_t local_work_size[1] = {64};
+  char buf[256] = "";
+
+  cl_int err = 0;
+  cl_perfcounter_amd perfCounter;
+  cl_perfcounter_property properties[4][2] = {};
+  cl_event perfEvent = NULL;
+  cl_ulong result = 0;
+  char deviceName[1024];
+
+  properties[0][0] = CL_PERFCOUNTER_GPU_BLOCK_INDEX;
+  properties[1][0] = CL_PERFCOUNTER_GPU_COUNTER_INDEX;
+  properties[2][0] = CL_PERFCOUNTER_GPU_EVENT_INDEX;
+  properties[3][0] = CL_PERFCOUNTER_NONE;
+
+  err = _wrapper->clGetDeviceInfo(global_device, CL_DEVICE_NAME, 1024,
+                                  deviceName, NULL);
+  CHECK_RESULT(err != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Begin: to be removed when crash on Kabini is fixed
+  if (strcmp(deviceName, "Kalindi") == 0) {
+    char msg[256];
+    SNPRINTF(msg, sizeof(msg), "Exiting as device is Kabini!\t");
+    testDescString = msg;
+    return;
+  }
+  // End: to be removed when crash on Kabini is fixed
+
+  bool found = false;
+  unsigned int devId = 0;
+  for (int idx = 0; !found && idx < DeviceCounterSize; idx++) {
+    if (strcmp(deviceName, DeviceInfo[idx].deviceName_) == 0) {
+      devId = DeviceInfo[idx].devId_;
+      properties[0][1] = DeviceInfo[idx].perfCounter_[_openTest].blockIdx;
+      properties[1][1] = DeviceInfo[idx].perfCounter_[_openTest].counterIdx;
+      properties[2][1] = DeviceInfo[idx].perfCounter_[_openTest].eventIdx;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    char msg[256];
+    SNPRINTF(msg, sizeof(msg), "Unsupported device(%s) for the test!\t",
+             deviceName);
+    testDescString = msg;
+    return;
+  }
+
+  perfCounter =
+      _wrapper->clCreatePerfCounterAMD(global_device, &properties[0][0], &err);
+  CHECK_RESULT(err != CL_SUCCESS, "Create PerfCounter failed\n");
+
+  // set clock mode
+  cl_set_device_clock_mode_input_amd setClockModeInput;
+  setClockModeInput.clock_mode = CL_DEVICE_CLOCK_MODE_PROFILING_AMD;
+  cl_set_device_clock_mode_output_amd setClockModeOutput = {};
+  _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput,
+                                    &setClockModeOutput);
+
+  _wrapper->clEnqueueBeginPerfCounterAMD(cmd_queue_, 1, &perfCounter, 0, NULL,
+                                         NULL);
+
+  for (unsigned int i = 0; i < MAX_ITERATIONS; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                      (void *)&inBuffer_[i % num_input_buf_]);
+    error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem),
+                                      (void *)&outBuffer_[i % num_output_buf_]);
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+  }
+
+  _wrapper->clEnqueueEndPerfCounterAMD(cmd_queue_, 1, &perfCounter, 0, NULL,
+                                       &perfEvent);
+  clWaitForEvents(1, &perfEvent);
+  if (perfEvent) clReleaseEvent(perfEvent);  // the event is owned by run()
+
+  // set clock mode to default
+  setClockModeInput.clock_mode = CL_DEVICE_CLOCK_MODE_DEFAULT_AMD;
+  _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput,
+                                    &setClockModeOutput);
+
+  _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA,
+                                    sizeof(cl_ulong), &result, NULL);
+  err = _wrapper->clReleasePerfCounterAMD(perfCounter);
+  // Checked only now so a failed launch cannot skip the cleanup above.
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed");
+  CHECK_RESULT(err != CL_SUCCESS, "Release PerfCounter failed\n");
+  // %llu with an explicit cast: cl_ulong is 64-bit, long may be only 32-bit.
+  const unsigned long long count = (unsigned long long)result;
+  switch (_openTest) {
+    case 0:
+      SNPRINTF(buf, sizeof(buf), "SQ Number of Waves: %llu  ", count);
+      break;
+    case 1:
+      if (devId >= 9) {
+        SNPRINTF(buf, sizeof(buf), "BigK Bank0 hits: %llu  ", count);
+      } else {
+        SNPRINTF(buf, sizeof(buf), "GRBM CP Busy: %llu  ", count);
+      }
+      break;
+  }
+
+  testDescString = buf;
+  CHECK_RESULT(!(result > 0), "Perf counter value read is zero!\n");
+}
+
+unsigned int OCLPerfCounters::close(void) {  // releases what open() created
+  if (cmd_queue_) _wrapper->clFinish(cmd_queue_);  // queue is 0 if open failed
+  if (inBuffer_) {
+    for (int i = 0; i < num_input_buf_; ++i) {
+      if (inBuffer_[i] == 0) continue;  // clCreateBuffer failed in open()
+      error_ = _wrapper->clReleaseMemObject(inBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(inBuffer_) failed");
+    }
+    delete[] inBuffer_;
+  }
+  if (outBuffer_) {
+    for (int i = 0; i < num_output_buf_; ++i) {
+      if (outBuffer_[i] == 0) continue;  // clCreateBuffer failed in open()
+      error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(outBuffer_) failed");
+    }
+    delete[] outBuffer_;
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  return _crcword;  // set to 0 by open()
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h
new file mode 100644
index 0000000000..89751e3e41
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestImp.h"
+
+class OCLPerfCounters : public OCLTestImp {  // ocltst test case built around a perf-counter read
+ public:
+  OCLPerfCounters();
+  virtual ~OCLPerfCounters();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // releases the CL objects below and returns _crcword
+  std::string shader_;
+  bool setData(cl_mem buffer, unsigned int data);
+  void checkData(cl_mem buffer);
+  cl_context context_;          // released in close() when non-null
+  cl_command_queue cmd_queue_;  // finished and released in close()
+  cl_program program_;          // released in close() when non-null
+  cl_kernel kernel_;            // released in close() when non-null
+  cl_mem* inBuffer_;            // heap array of num_input_buf_ handles; delete[]d in close()
+  cl_mem* outBuffer_;           // heap array of num_output_buf_ handles; delete[]d in close()
+  cl_int num_input_buf_;        // element count of inBuffer_
+  cl_int num_output_buf_;       // element count of outBuffer_
+  cl_int error_;                // status of the most recent CL call
+  unsigned int width_;
+  unsigned int bufSize_;
+  unsigned int blockSize_;
+  static const unsigned int MAX_ITERATIONS = 1;
+  bool isAMD;  // NOTE(review): presumably true on an AMD platform - confirm in open()
+};
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp
new file mode 100644
index 0000000000..5151ae9c12
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPersistent.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+const static char* strKernel =  // OpenCL C: each work-item writes (x, y, 0, 0) at its own (x, y)
+    "__kernel void persistentImage( write_only image2d_t source){   \n"
+    "    int  tidX = get_global_id(0);\n"
+    "    int  tidY = get_global_id(1);\n"
+    "    write_imagei( source, (int2)( tidX, tidY ),(int4)( tidX, tidY,0,0 ) "
+    ");\n"
+    "}\n";
+
+OCLPersistent::OCLPersistent() : clImage_(0) { _numSubTests = 1; }  // no image yet; one sub-test
+
+OCLPersistent::~OCLPersistent() {}  // nothing to free here; close() releases clImage_
+
+void OCLPersistent::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Build the image-writing kernel for the selected device, then create the image.
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed!");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays a valid empty string if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], CL_PROGRAM_BUILD_LOG,
+                                    sizeof(programLog) - 1, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed!");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "persistentImage", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed!");
+  cl_image_format format;
+  format.image_channel_data_type = CL_SIGNED_INT32;
+  format.image_channel_order = CL_RG;
+  cl_image_desc desc = {0};
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = c_dimSize;
+  desc.image_height = c_dimSize;
+  desc.image_depth = 1;
+  desc.image_array_size = 1;
+  // Two-channel int32 image, requested with the AMD persistent-memory flag.
+  clImage_ =
+      clCreateImage(context_, CL_MEM_USE_PERSISTENT_MEM_AMD | CL_MEM_WRITE_ONLY,
+                    &format, &desc, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed");
+}
+
+void OCLPersistent::run(void) {
+  // Run the kernel over the whole image, then map it on the host and verify it.
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImage_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {c_dimSize, c_dimSize, 1};
+  size_t pitch, slice;
+  cl_event event = NULL;
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmdQueues_[_deviceId], kernel_, 2, NULL, dimSizes, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  error_ = _wrapper->clEnqueueMarkerWithWaitList(cmdQueues_[_deviceId], 0, NULL,
+                                                 &event);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMarkerWithWaitList() failed");
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+  // Poll the marker; the status is a cl_int and goes negative if a command aborts.
+  cl_int status = CL_QUEUED;
+  do {
+    error_ = _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                      sizeof(status), &status, NULL);
+  } while ((error_ == CL_SUCCESS) && (status > CL_COMPLETE));
+  clReleaseEvent(event);  // done with the marker event
+  CHECK_RESULT((error_ != CL_SUCCESS) || (status != CL_COMPLETE), "Waiting for the kernel failed");
+
+  unsigned int* image = (unsigned int*)_wrapper->clEnqueueMapImage(
+      cmdQueues_[_deviceId], clImage_, CL_TRUE, CL_MAP_READ, origin, region,
+      &pitch, &slice, 0, NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapImage() failed");
+  bool result = validateImage(image, pitch, c_dimSize);
+  // Unmap before reporting, so a validation failure does not leave the image mapped.
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], clImage_, image, 0,
+                                    NULL, NULL);
+  CHECK_RESULT(!result, "Validation failed!");
+}
+
+unsigned int OCLPersistent::close(void) {
+  if (clImage_) _wrapper->clReleaseMemObject(clImage_);  // open() may have failed before creating it
+  clImage_ = 0;  // a later open()/close() cycle must not release a stale handle
+  return OCLTestImp::close();
+}
+
+bool OCLPersistent::validateImage(unsigned int* image, size_t pitch,
+                                  unsigned int dimSize) {
+  // Each pixel is an (R, G) pair expected to equal (x, y); rows are `pitch` bytes apart.
+  const unsigned int* row = image;
+  for (unsigned int y = 0; y < dimSize; y++) {
+    for (unsigned int x = 0; x < dimSize; x++) {
+      const unsigned int r = row[2 * x];
+      const unsigned int g = row[2 * x + 1];
+      if ((r != x) || (g != y)) {
+        printf("Failed at coordinate (%5u, %5u) - R:%u, G:%u value\n", x, y,
+               r, g);  // %u: every argument is unsigned
+        return false;
+      }
+    }
+    row += pitch / sizeof(unsigned int);
+  }
+  return true;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h
new file mode 100644
index 0000000000..a7585db0a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERSISTENT_H_
+#define _OCL_PERSISTENT_H_
+
+#include "OCLTestImp.h"
+
+class OCLPersistent : public OCLTestImp {  // fills a 2D image on the device, checks it on the host
+ public:
+  OCLPersistent();
+  virtual ~OCLPersistent();
+  static const unsigned int c_dimSize = 510;  // image width and height, in pixels
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);  // builds the kernel and creates clImage_
+  virtual void run(void);                    // runs the kernel, maps and validates the image
+  virtual unsigned int close(void);          // releases clImage_, then OCLTestImp::close()
+
+ private:
+  ////////////////////
+  // test functions //
+  ////////////////////
+  // Returns true when every pixel holds (x, y); pitch is the row pitch in bytes.
+  bool validateImage(unsigned int* image, size_t pitch, unsigned int dimSize);
+  /////////////////////
+  // private members //
+  /////////////////////
+
+  // CL identifiers
+  cl_mem clImage_;  // created in open() with CL_MEM_USE_PERSISTENT_MEM_AMD
+};
+
+#endif  // _OCL_PERSISTENT_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp
new file mode 100644
index 0000000000..c67d8ed620
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp
@@ -0,0 +1,218 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPinnedMemory.h"
+
+#ifdef _WIN32
+#include <VersionHelpers.h>
+// Pick up from OCLSVM
+size_t getTotalSystemMemory();
+#else
+#include <sys/sysinfo.h>
+size_t getTotalSystemMemory() {  // total RAM in bytes, or 0 when sysinfo() fails
+  struct sysinfo info;
+  if (sysinfo(&info) != 0) return 0;
+  return static_cast<size_t>(info.totalram) * info.mem_unit;  // totalram counts mem_unit-byte units
+}
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+
+OCLPinnedMemory::OCLPinnedMemory() : host_memory_(nullptr) { _numSubTests = 2; }  // close() delete[]s host_memory_ unconditionally
+
+OCLPinnedMemory::~OCLPinnedMemory() {}  // host_memory_ is freed in close(), not here
+
+void OCLPinnedMemory::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  host_memory_ = nullptr;  // close() delete[]s this, so clear it before anything can bail out
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_ERROR(error_, "Error opening test");
+  _openTest = test;
+
+#ifdef _WIN32
+  // Observed failures on Win7. NOTE(review): the check admits Win8+, the message says Win10.
+  if (!IsWindows8OrGreater()) {
+    printf("Test requires Win10, skipping...\n");
+    _openTest = -1;  // run() does nothing for any value other than 0 or 1
+    return;
+  }
+#endif
+
+  cl_int status;
+
+  // Observed failures with Carrizo on GSL path
+  cl_bool is_apu;
+  status = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_HOST_UNIFIED_MEMORY,
+                           sizeof(cl_bool), &is_apu, nullptr);
+  CHECK_ERROR(status, "clGetDeviceInfo failed.");
+  if (is_apu) {
+    printf("Test not supported for apus, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+
+  cl_uint address_bits;
+  status = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_ADDRESS_BITS,
+                           sizeof(cl_uint), &address_bits, nullptr);
+  CHECK_ERROR(status, "clGetDeviceInfo failed.");
+  if (address_bits < 64u) {
+    printf("GPU VA range size below 4GB, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+  // Size the host allocation from total system RAM; the test needs more than 4GB.
+  row_size_ = getTotalSystemMemory();
+  if (row_size_ <= (1ull << 32u)) {
+    printf("System memory below 4GB, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+  row_size_ *= ratio_;                 // take a fraction (ratio_) of total RAM ...
+  row_size_ = floor(sqrt(row_size_));  // ... and use its square root as the row length in bytes
+  row_size_ = (row_size_ + row_data_size_ - 1) & ~(row_data_size_ - 1);  // round up to a whole element
+
+  pin_size_ = row_size_ * row_size_ / row_data_size_;  // elements in row_size_ rows of row_size_ bytes
+  host_memory_ = new row_data_t[pin_size_];
+}
+
+void OCLPinnedMemory::runNoPrepinnedMemory() {  // sub-test 0: rect-read straight into host_memory_
+  cl_int status;
+  // tmp holds 0, 1, 2, ... (one value per row); the destination starts zeroed.
+  row_data_t* tmp = new row_data_t[row_size_];
+  std::iota(tmp, tmp + row_size_, 0);
+  std::fill_n(host_memory_, pin_size_, 0);
+  // Stage tmp through a USE_HOST_PTR buffer into a second buffer with a buffer-to-buffer copy.
+  cl_mem tmp_buffer = clCreateBuffer(context_, CL_MEM_USE_HOST_PTR,
+                                     row_size_ * row_data_size_, tmp, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+  cl_mem buffer = clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                 row_size_ * row_data_size_, nullptr, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  status = clEnqueueCopyBuffer(cmdQueues_[_deviceId], tmp_buffer, buffer, 0, 0,
+                               row_size_ * row_data_size_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueCopyBuffer failed.");
+  clFinish(cmdQueues_[_deviceId]);
+  // Rect read: row_size_ rows of row_data_size_ bytes each, host rows row_size_ bytes apart.
+  size_t buffer_offset[3] = {0, 0, 0};
+  size_t host_offset[3] = {0, 0, 0};
+  size_t region[3] = {row_data_size_, row_size_, 1};
+
+  status = clEnqueueReadBufferRect(
+      cmdQueues_[_deviceId], buffer, CL_TRUE, buffer_offset, host_offset,
+      region, 0, 0, row_size_, 0, host_memory_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueReadBufferRect failed.");
+  status = clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(status, "clFinish failed.");
+  // Element i of tmp must now sit at the start of host row i.
+  for (uint64_t i = 0; i < row_size_; i++) {
+    if (tmp[i] != host_memory_[i * row_size_ / row_data_size_]) {
+      status = -1;  // local "mismatch" marker, tested just below
+      break;
+    }
+  }
+
+  CHECK_RESULT(status == -1, "Error when reading data.");
+  // NOTE(review): if the CHECK_* macros return early, tmp and both buffers leak - confirm.
+  status = clReleaseMemObject(buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  status = clReleaseMemObject(tmp_buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  delete[] tmp;
+}
+
+void OCLPinnedMemory::runPrepinnedMemory() {
+  cl_int status;
+  // Sub-test 1: as sub-test 0, but host_memory_ is first wrapped in a CL buffer and mapped.
+  row_data_t* tmp = new row_data_t[row_size_];
+  std::iota(tmp, tmp + row_size_, 0);
+  std::fill_n(host_memory_, pin_size_, 0);
+
+  cl_mem tmp_buffer = clCreateBuffer(context_, CL_MEM_USE_HOST_PTR,
+                                     row_size_ * row_data_size_, tmp, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+  cl_mem buffer = clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                 row_size_ * row_data_size_, nullptr, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  status = clEnqueueCopyBuffer(cmdQueues_[_deviceId], tmp_buffer, buffer, 0, 0,
+                               row_size_ * row_data_size_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueCopyBuffer failed.");
+  // Wrap the whole destination in a USE_HOST_PTR buffer and map it before the read.
+  cl_mem pinned_buffer =
+      clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, pin_size_ * row_data_size_,
+                     host_memory_, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  void* mapped = clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], pinned_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+      0, pin_size_ * row_data_size_, 0, nullptr, nullptr, &status);
+  CHECK_ERROR(status, "clEnqueueMapBuffer failed.");
+
+  size_t buffer_offset[3] = {0, 0, 0};
+  size_t host_offset[3] = {0, 0, 0};
+  size_t region[3] = {row_data_size_, row_size_, 1};
+  // Rect read: row_size_ rows of row_data_size_ bytes each, host rows row_size_ bytes apart.
+  status = clEnqueueReadBufferRect(
+      cmdQueues_[_deviceId], buffer, CL_TRUE, buffer_offset, host_offset,
+      region, 0, 0, row_size_, 0, host_memory_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueReadBufferRect failed.");
+
+  for (uint64_t i = 0; i < row_size_; i++) {
+    if (tmp[i] != host_memory_[i * row_size_ / row_data_size_]) {
+      status = -1;  // local "mismatch" marker, tested just below
+      break;
+    }
+  }
+
+  CHECK_RESULT(status == -1, "Error when reading data.");
+  // Unmap with the pointer the map call returned, as the API requires.
+  status = clEnqueueUnmapMemObject(cmdQueues_[_deviceId], pinned_buffer,
+                                   mapped, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueUnmap failed.");
+  status = clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(status, "clFinish failed.");
+
+  status = clReleaseMemObject(pinned_buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  status = clReleaseMemObject(buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  status = clReleaseMemObject(tmp_buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  delete[] tmp;
+}
+
+void OCLPinnedMemory::run() {
+  // Dispatch on the sub-test chosen in open(); any other value (e.g. -1) does nothing.
+  if (_openTest == 0) {
+    // Sub-test 0: destination memory is not wrapped in a CL buffer beforehand.
+    runNoPrepinnedMemory();
+  } else if (_openTest == 1) {
+    // Sub-test 1: destination memory is wrapped in a CL buffer and mapped first.
+    runPrepinnedMemory();
+  }
+}
+
+unsigned int OCLPinnedMemory::close() {  // frees the host allocation, then base-class teardown
+  delete[] host_memory_;  // NOTE(review): pointer is not reset here; open() clears it again
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h
new file mode 100644
index 0000000000..bc3d633b6e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PINNED_MEMORY_H_
+#define _OCL_PINNED_MEMORY_H_
+
+#include <cstdint>
+
+#include "OCLTestImp.h"
+
+class OCLPinnedMemory : public OCLTestImp {  // rect-reads a buffer into a very large host allocation
+ public:
+  OCLPinnedMemory();
+  ~OCLPinnedMemory();
+
+  void open(unsigned int test, char* units, double& conversion,
+            unsigned int deviceId) override;  // sizes and allocates host_memory_, or marks the test skipped
+  void run() override;                        // dispatches to one of the two sub-tests below
+  unsigned int close() override;              // delete[]s host_memory_
+
+ private:
+  void runNoPrepinnedMemory();  // sub-test 0
+  void runPrepinnedMemory();    // sub-test 1
+
+  static constexpr const float ratio_ = 0.4f;  // fraction of total system RAM used to size the test
+  using row_data_t = uint64_t;                 // element type of the host allocation
+
+  row_data_t* host_memory_;                    // pin_size_ elements, allocated in open()
+  size_t row_data_size_ = sizeof(row_data_t);  // bytes per element
+  size_t row_size_;                            // row count, and host row pitch in bytes
+  size_t pin_size_;                            // number of elements in host_memory_
+};
+
+#endif  // _OCL_PINNED_MEMORY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp
new file mode 100644
index 0000000000..c1abadf7fb
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp
@@ -0,0 +1,182 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPlatformAtomics.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static char* strKernel =  // OpenCL 2.0 C: wait for *pSync != 0, then atomically add 1 to *ptr numIterations times
+    "__kernel void test_atomic_kernel(volatile __global atomic_int *pSync, "
+    "volatile __global atomic_int *ptr, int numIterations)\n"
+    "{                                                                         "
+    "                                                 \n"
+    "   while(atomic_load_explicit(pSync,  memory_order_acquire, "
+    "memory_scope_all_svm_devices) == 0);                           \n"
+    "   for (int i = 0; i < numIterations; i++) {                              "
+    "                                                 \n"
+    "        atomic_fetch_add_explicit(ptr, 1, memory_order_acq_rel, "
+    "memory_scope_all_svm_devices);                             \n"
+    "   }                                                                      "
+    "                                                 \n"
+    "}                                                                         "
+    "                                                 \n";
+
+OCLPlatformAtomics::OCLPlatformAtomics()
+    : failed_(false),  // set by open() when the device version is below 2
+      svmCaps_(0) {    // filled by run() from CL_DEVICE_SVM_CAPABILITIES
+  _numSubTests = 1;
+}
+
+OCLPlatformAtomics::~OCLPlatformAtomics() {}
+
+void OCLPlatformAtomics::open(unsigned int test, char* units,
+                              double& conversion, unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  // Set failed_ (run() then does nothing) on pre-2.0 devices; otherwise build the kernel.
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // "OpenCL <major>.<minor> ..." puts the major version digit at offset 7.
+  failed_ = (error_ == CL_SUCCESS) &&
+            ((param_size <= 7) || (strVersion[7] < '2'));
+  delete[] strVersion;  // array form, and freed before every exit below
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (failed_) return;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  // The kernel uses OpenCL 2.0 atomics, hence the explicit -cl-std option.
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays a valid empty string if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG,
+                                    sizeof(programLog) - 1, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "test_atomic_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+}
+
+static int AtomicLoad(volatile cl_int* object) {  // host-side atomic read of *object
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+  return InterlockedExchangeAdd((volatile long*)object, 0);  // adding 0 yields the current value
+#elif defined(__GNUC__)
+  return __sync_add_and_fetch(object, 0);  // adding 0 yields the current value
+#else
+  printf("Atomic load not supported, aborting...");  // NOTE(review): only prints; nothing aborts here
+  return 0;
+#endif
+}
+
+static int AtomicIncrement(volatile cl_int* object) {  // atomically ++*object; returns the new value
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+  return _InterlockedIncrement((volatile long*)object);
+#elif defined(__GNUC__)
+  return __sync_add_and_fetch(object, 1);  // post-increment value, like _InterlockedIncrement
+#endif
+  printf("Atomic increment not supported, aborting...");  // reached only with other compilers
+  return 0;
+}
+
+void OCLPlatformAtomics::run(void) {
+  if (failed_) return;
+  // Host and a single device work-item each add numIterations to one shared SVM counter.
+#ifdef CL_VERSION_2_0
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_SVM_CAPABILITIES,
+                                sizeof(svmCaps_), &svmCaps_, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo()  failed");
+
+  if (!(svmCaps_ & CL_DEVICE_SVM_ATOMICS)) {
+    printf("SVM atomics not supported, skipping test...\n");
+    return;
+  }
+  // Two fine-grain, atomics-capable SVM ints: a start flag and the shared counter.
+  volatile cl_int* pSyncBuf = (volatile cl_int*)_wrapper->clSVMAlloc(
+      context_, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+      sizeof(cl_int), 0);
+  CHECK_RESULT(!pSyncBuf, "clSVMAlloc() failed");
+  *pSyncBuf = 0;
+
+  volatile cl_int* pAtomicBuf = (volatile cl_int*)_wrapper->clSVMAlloc(
+      context_, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+      sizeof(cl_int), 0);
+  CHECK_RESULT(!pAtomicBuf, "clSVMAlloc() failed");
+  *pAtomicBuf = 0;
+
+  error_ =
+      _wrapper->clSetKernelArgSVMPointer(kernel_, 0, (const void*)pSyncBuf);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArgSVMPointer() failed");
+
+  error_ =
+      _wrapper->clSetKernelArgSVMPointer(kernel_, 1, (const void*)pAtomicBuf);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArgSVMPointer() failed");
+
+  cl_int numIterations = 0x100000;
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_int), &numIterations);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t globalWorkSize[1] = {1};
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       globalWorkSize, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  clFlush(cmdQueues_[_deviceId]);
+  // Raise the start flag: the kernel spins until *pSyncBuf becomes non-zero.
+  AtomicIncrement(pSyncBuf);
+
+  // wait until we see some activity from a device (try to run host side
+  // simultaneously).
+  while (AtomicLoad(pAtomicBuf /*, memory_order_relaxed*/) == 0)
+    ;
+  // Host side: the same number of atomic increments on the same counter.
+  for (int i = 0; i < numIterations; i++) {
+    AtomicIncrement(pAtomicBuf);
+  }
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "clFinish() failed");
+  // Both sides added numIterations, so the counter must equal exactly twice that.
+  int expected = numIterations * 2;
+  CHECK_RESULT(*pAtomicBuf != expected, "Expected: 0x%x, found: 0x%x", expected,
+               *pAtomicBuf);
+
+  _wrapper->clSVMFree(context_, (void*)pSyncBuf);
+  _wrapper->clSVMFree(context_, (void*)pAtomicBuf);
+#endif
+}
+
+unsigned int OCLPlatformAtomics::close(void) { return OCLTestImp::close(); }  // no test-specific teardown; defers to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h
new file mode 100644
index 0000000000..c728fb6c36
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PLATFORM_ATOMICS_H_
+#define _OCL_PLATFORM_ATOMICS_H_
+
+#include "OCLTestImp.h"
+
+class OCLPlatformAtomics : public OCLTestImp {  // host and device atomically increment one SVM counter
+ public:
+  OCLPlatformAtomics();
+  virtual ~OCLPlatformAtomics();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // checks the device version, builds the kernel
+  virtual void run(void);                    // returns at once when failed_ is set
+  virtual unsigned int close(void);          // forwards to OCLTestImp::close()
+
+  bool failed_;                 // set in open() when the device version is below 2
+  unsigned long long svmCaps_;  // CL_DEVICE_SVM_CAPABILITIES bits, queried in run()
+};
+
+#endif  // _OCL_PLATFORM_ATOMICS_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp
new file mode 100644
index 0000000000..4baf3db684
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp
@@ -0,0 +1,274 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLProgramScopeVariables.h"
+
+#include "CL/cl.h"
+
+OCLProgramScopeVariables::OCLProgramScopeVariables()  // zero-init for close()
+    : silentFailure(false), kernel1_(0), kernel2_(0) { _numSubTests = 3; }
+OCLProgramScopeVariables::~OCLProgramScopeVariables() {}
+
+void OCLProgramScopeVariables::open(unsigned int test, char* units,
+                                    double& conversion, unsigned int deviceId) {
+  silentFailure = false;  // own state first: run()/close() read these even
+  kernel1_ = kernel2_ = 0;  // if a CHECK_RESULT below leaves open() early
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "error_ opening test");
+  _openTest = test;
+  program_ = 0;
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = (char*)malloc(param_size);  // "OpenCL C <major>.<minor>"
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION,
+                                param_size, strVersion, 0);
+  const bool preCL2 = (error_ == CL_SUCCESS) && (strVersion[9] < '2');
+  free(strVersion);  // before the check below, so its exit path cannot leak
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    printf("\nOpenCL C 2.0 not supported\n");
+    silentFailure = true;
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,  // unused no-op
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+void OCLProgramScopeVariables::run(void) {
+  // open() sets silentFailure when the device reports OpenCL C below 2.0.
+  if (silentFailure) {
+    return;
+  }
+  // Dispatch on the sub-test index stored by open(); any other index is
+  // ignored.
+  if (_openTest == 0) {
+    test0();
+  } else if (_openTest == 1) {
+    test1();
+  } else if (_openTest == 2) {
+    test2();
+  }
+}
+
+void OCLProgramScopeVariables::test0(void) {
+  // Kernel "test1" fills the program-scope array g; kernel "test2" copies it to
+  // A, so the host read-back shows g kept its contents between launches.
+  const char* kernel_str =
+      "global int g[1000] = {0}; \n\
+        __kernel void test1 (global unsigned int * A) \n\
+        { \n\
+            int id = get_global_id(0);  \n\
+            g[id] = id; \n\
+        } \n\
+        __kernel void test2 (global unsigned int * A) \n\
+        { \n\
+            int id = get_global_id(0);  \n\
+            A[id] = g[id]; \n\
+        } \n";
+  const size_t arrSize = 1000;
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_);
+  buffers_.push_back(buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays printable if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 400, log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel1_ = _wrapper->clCreateKernel(program_, "test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel1 failed");
+  kernel2_ = _wrapper->clCreateKernel(program_, "test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel2 failed");
+  error_ = _wrapper->clSetKernelArg(kernel1_, 0, sizeof(cl_mem),
+                                    (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem),
+                                    (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = arrSize;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel1_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // orders the read after kernel2
+  // Allocate only now: no early-exit check sits between malloc and free.
+  cl_uint* output_arr = (cl_uint*)malloc(arrSize * sizeof(cl_uint));
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0],
+                                         CL_TRUE, 0, sizeof(cl_uint) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  bool bResult = (error_ == CL_SUCCESS);
+  for (unsigned int i = 0; bResult && i < arrSize; ++i) {
+    bResult = (output_arr[i] == i);
+  }
+  free(output_arr);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  CHECK_RESULT((bResult == false), "Program Scope Variables - test0 failed");
+}
+
+void OCLProgramScopeVariables::test1(void) {
+  // Kernel "test1" stores 55 in the program-scope int temp; kernel "test2"
+  // copies temp to A[0], which the host then reads back and compares.
+  const char* kernel_str =
+      "global int temp = 0; \n\
+        __kernel void test1 (global unsigned int * A) \n\
+        { \n\
+            int id = get_global_id(0);  \n\
+            if (id == 0) temp = 55; \n\
+        } \n\
+        __kernel void test2 (global unsigned int * A) \n\
+        { \n\
+            int id = get_global_id(0);  \n\
+            if (id == 0) A[0] = temp; \n\
+        } \n";
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           sizeof(cl_uint), 0, &error_);
+  buffers_.push_back(buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays printable if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 400, log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel1_ = _wrapper->clCreateKernel(program_, "test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel1 failed");
+  kernel2_ = _wrapper->clCreateKernel(program_, "test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel2 failed");
+  error_ = _wrapper->clSetKernelArg(kernel1_, 0, sizeof(cl_mem),
+                                    (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem),
+                                    (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = 1;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel1_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // orders the read after kernel2
+  cl_uint output = 0;  // plain local: nothing to free on an early exit
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0],
+                                         CL_TRUE, 0, sizeof(cl_uint),
+                                         &output, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  CHECK_RESULT((output != 55), "Program Scope Variables - test1 failed");
+}
+
+void OCLProgramScopeVariables::test2(void) {
+  // Kernel "test1" stores 65 in the program-scope int temp; kernel "test2"
+  // reads it back through the program-scope pointer table ptr into A[0].
+  const char* kernel_str =
+      "global int temp = 0; \n\
+        global int* ptr[] = {&temp}; \n\
+        __kernel void test1 (global unsigned int * A) \n\
+        { \n\
+            int id = get_global_id(0);  \n\
+            if (id == 0) temp = 65; \n\
+        } \n\
+        __kernel void test2 (global unsigned int * A) \n\
+        { \n\
+            int id = get_global_id(0);  \n\
+            if (id == 0) A[0] = *ptr[0]; \n\
+        } \n";
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           sizeof(cl_uint), 0, &error_);
+  buffers_.push_back(buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[400] = {0};  // stays printable if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 400, log, 0);
+    printf("\n\n%s\n\n", log);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel1_ = _wrapper->clCreateKernel(program_, "test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel1 failed");
+  kernel2_ = _wrapper->clCreateKernel(program_, "test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel2 failed");
+  error_ = _wrapper->clSetKernelArg(kernel1_, 0, sizeof(cl_mem),
+                                    (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem),
+                                    (void*)&buffers_[0]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = 1;
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel1_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // orders the read after kernel2
+  cl_uint output = 0;  // plain local: nothing to free on an early exit
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0],
+                                         CL_TRUE, 0, sizeof(cl_uint),
+                                         &output, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  CHECK_RESULT((output != 65), "Program Scope Variables - test2 failed");
+}
+
+unsigned int OCLProgramScopeVariables::close(void) {
+  if (kernel1_) {  // non-zero only after a sub-test created the "test1" kernel
+    error_ = _wrapper->clReleaseKernel(kernel1_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel1 failed");
+    kernel1_ = 0;  // cleared so a repeated close() skips this branch
+  }
+  if (kernel2_) {  // same handling for the "test2" kernel
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel2 failed");
+    kernel2_ = 0;
+  }
+  return OCLTestImp::close();  // base-class close() supplies the return value
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h
new file mode 100644
index 0000000000..e0dc0429dd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ProgramScopeVariables_H_
+#define _OCL_ProgramScopeVariables_H_
+
+#include "OCLTestImp.h"
+
+class OCLProgramScopeVariables : public OCLTestImp {
+ public:
+  OCLProgramScopeVariables();
+  virtual ~OCLProgramScopeVariables();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // dispatches to test0/test1/test2
+  virtual unsigned int close(void);  // releases kernel1_ and kernel2_
+
+ private:
+  void test0(void);  // 1000-element program-scope array written, then read
+  void test1(void);  // program-scope scalar written, then read back
+  void test2(void);  // scalar read back through a program-scope pointer table
+  bool silentFailure;  // set by open() when OpenCL C < 2.0; run() then returns
+  cl_kernel kernel1_;  // kernel "test1" (writes the program-scope variable)
+  cl_kernel kernel2_;  // kernel "test2" (copies it to the output buffer)
+};
+
+#endif  // _OCL_ProgramScopeVariables_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp
new file mode 100644
index 0000000000..73d1915309
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp
@@ -0,0 +1,415 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLRTQueue.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+static const size_t Iterations = 0x100;  // run() queues Iterations / 10 fillers
+static const size_t IterationDivider = 2;  // only feeds MaxBuffers
+static const size_t MaxBuffers = IterationDivider;  // device buffers in open()
+static const size_t BufSize = 0x800000;  // cl_uint elements per buffer
+
+const static char* strKernel =  // per-item loop length grows with id / 0x400
+    "__kernel void factorial(__global uint* out)                        \n"
+    "{                                                                  \n"
+    "   uint id = get_global_id(0);                                     \n"
+    "   uint factorial = 1;                                             \n"
+    "   for (uint i = 1; i < (id / 0x400); ++i)                         \n"
+    "   {                                                               \n"
+    "       factorial *= i;                                             \n"
+    "   }                                                               \n"
+    "    out[id] = factorial;                                            \n"
+    "}                                                                  \n";
+
+OCLRTQueue::OCLRTQueue()
+    : rtQueue_(NULL), rtQueue1_(NULL), kernel2_(NULL), testID_(0),
+      failed_(false), cu_(0), maxCUs_(0), rtCUs_(0) {
+  // Without the OpenCL 2.0 headers open()/run() compile to empty bodies, so no
+  // sub-tests are registered in that configuration.
+#ifdef CL_VERSION_2_0
+  _numSubTests = 2;
+#else
+  _numSubTests = 0;
+#endif
+}
+
+OCLRTQueue::~OCLRTQueue() {}
+
+void OCLRTQueue::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId) {
+#ifdef CL_VERSION_2_0
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];  // "OpenCL <major>.<minor> ..."
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  const bool preCL2 = (error_ == CL_SUCCESS) && (strVersion[7] < '2');
+  delete[] strVersion;  // freed before any of the early exits below
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL2) {
+    failed_ = true;  // makes run() return immediately
+    return;
+  }
+  cl_uint rtQueues;
+#define CL_DEVICE_MAX_REAL_TIME_COMPUTE_QUEUES_AMD 0x404D
+#define CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD 0x404E
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_REAL_TIME_COMPUTE_QUEUES_AMD,
+                                     sizeof(rtQueues), &rtQueues, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (rtQueues < 2) {
+    failed_ = true;
+    return;
+  }
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD,
+                                     sizeof(rtCUs_), &rtCUs_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(maxCUs_), &maxCUs_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+  for (size_t i = 0; i < MaxBuffers; ++i) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                      BufSize * sizeof(cl_uint), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                    BufSize * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,  // unused no-op
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+void OCLRTQueue::run(void) {  // prints timings for several queue combinations
+#ifdef CL_VERSION_2_0
+  if (failed_) {  // set by open() for pre-2.0 devices or fewer than 2 RT queues
+    return;
+  }
+
+  if (testID_ == 0) {
+    cu_ = rtCUs_ >> 1;  // sub-test 0: half of the reported real-time CUs
+  } else {
+    cu_ = rtCUs_;  // other sub-tests: all reported real-time CUs
+  }
+  // Create a real-time queue, passing cu_ as its compute-unit property
+#define CL_QUEUE_REAL_TIME_COMPUTE_UNITS_AMD 0x404f
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(0),
+      CL_QUEUE_REAL_TIME_COMPUTE_UNITS_AMD, cu_, 0};
+  rtQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[_deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+
+#define CL_QUEUE_MEDIUM_PRIORITY_AMD 0x4050
+  const cl_queue_properties cprops2[] = {CL_QUEUE_PROPERTIES,
+                                         static_cast<cl_queue_properties>(0),
+                                         CL_QUEUE_MEDIUM_PRIORITY_AMD, 0, 0};
+  rtQueue1_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[_deviceId], cprops2, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+
+  void* values;
+  CPerfCounter timer;
+  cl_mem mapBuffer = buffers()[MaxBuffers];  // presumably open()'s last buffer
+
+  values = _wrapper->clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], mapBuffer, true, (CL_MAP_READ | CL_MAP_WRITE), 0,
+      BufSize * sizeof(cl_uint), 0, NULL, NULL, &error_);
+  // NOTE(review): the map's error_ is not checked; values is only unmapped.
+  cl_mem buffer = buffers()[0];
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // SubTest 1: default queue alone (one untimed warm-up, one timed launch)
+  size_t gws[1] = {BufSize};
+  size_t x;
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+  // GB/s over the output buffer; NOTE(review): perf is never reported
+  double perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf("\n Generic Queue(CUs: %d) Time:               %.3fs\n", maxCUs_, sec);
+
+  // SubTest 2: real-time queue alone (warm-up, then one timed launch)
+  error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws,
+                                            NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(rtQueue_);
+
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws,
+                                              NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(rtQueue_);
+
+  timer.Stop();
+
+  sec = timer.GetElapsedTime();
+  // NOTE(review): perf is assigned but never reported
+  perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf(" RT Queue0 (CUs: %2d) Time:                  %.3fs\n", cu_, sec);
+
+  // SubTest 3: medium-priority queue alone (warm-up, then one timed launch)
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws,
+                                            NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(rtQueue1_);
+
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws,
+                                              NULL, 0, NULL, NULL);
+
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(rtQueue1_);
+
+  timer.Stop();
+
+  sec = timer.GetElapsedTime();
+  // NOTE(review): perf is assigned but never reported
+  perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf(" Medium Queue (CUs: %2d) Time:                  %.3fs\n",
+         maxCUs_ - cu_, sec);
+
+  // SubTest 4: default queue again, timed without a fresh warm-up launch
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  timer.Stop();
+
+  sec = timer.GetElapsedTime();
+  // NOTE(review): perf is assigned but never reported
+  perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf(" Generic Queue(CUs: %d) Time:               %.3fs\n", maxCUs_ - cu_,
+         sec);
+
+  // SubTest 5: real-time launch timed after flushing default-queue filler work
+  for (x = 0; x < Iterations / 10; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws,
+                                              NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(rtQueue_);
+
+  timer.Stop();
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  sec = timer.GetElapsedTime();
+  // NOTE(review): perf is assigned but never reported
+  perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf(" Async RT(CUs: %d) + Generic(CUs: %d) Time: %.3fs\n", cu_,
+         maxCUs_ - cu_, sec);
+
+  // SubTest 6: medium-priority launch timed after flushing filler work
+  for (x = 0; x < Iterations / 10; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws,
+                                              NULL, 0, NULL, NULL);
+
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(rtQueue1_);
+
+  timer.Stop();
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  sec = timer.GetElapsedTime();
+  // NOTE(review): perf is assigned but never reported
+  perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf(" Async Medium(CUs: %d) + Generic(CUs: %d) Time: %.3fs\n",
+         maxCUs_ - cu_, maxCUs_ - cu_, sec);
+
+  // SubTest 7: real-time and medium launches timed together over filler work
+  for (x = 0; x < Iterations / 10; x++) {
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+  timer.Reset();
+  timer.Start();
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws,
+                                              NULL, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFlush(rtQueue_);
+  for (x = 0; x < 1; x++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws,
+                                              NULL, 0, NULL, NULL);
+
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+
+  _wrapper->clFlush(rtQueue1_);
+  _wrapper->clFinish(rtQueue_);
+  _wrapper->clFinish(rtQueue1_);
+  timer.Stop();
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+
+  sec = timer.GetElapsedTime();
+  // NOTE(review): perf is assigned but never reported
+  perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec;
+
+  printf(
+      " Async RT0(CUs: %d) + Medium(CUs: %d) + Generic(CUs: %d) Time: %.3fs\n",
+      cu_, maxCUs_ - cu_, maxCUs_ - cu_, sec);
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer,
+                                             values, 0, NULL, NULL);
+  _wrapper->clFinish(cmdQueues_[_deviceId]);  // drains the filler work too
+#endif
+}
+
+unsigned int OCLRTQueue::close(void) {
+#ifdef CL_VERSION_2_0
+  // Handles are cleared after release so a second close(), or a close() after
+  // an open() that bailed out early, cannot release a stale handle again.
+  if (NULL != rtQueue_) _wrapper->clReleaseCommandQueue(rtQueue_);
+  rtQueue_ = NULL;
+  if (NULL != rtQueue1_) _wrapper->clReleaseCommandQueue(rtQueue1_);
+  rtQueue1_ = NULL;
+  // NOTE(review): nothing in this file assigns kernel2_ after the constructor.
+  if (NULL != kernel2_) _wrapper->clReleaseKernel(kernel2_);
+  kernel2_ = NULL;
+
+  return OCLTestImp::close();
+#else
+  return CL_SUCCESS;
+#endif
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h
new file mode 100644
index 0000000000..b4f98dc5ae
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_RT_QUEUE_H_
+#define _OCL_RT_QUEUE_H_
+
+#include "OCLTestImp.h"
+
+class OCLRTQueue : public OCLTestImp {  // ocltst test case built on OCLTestImp
+ public:
+  OCLRTQueue();
+  virtual ~OCLRTQueue();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // releases rtQueue_, rtQueue1_, kernel2_
+
+ private:
+  cl_command_queue rtQueue_;   // extra queue run() enqueues on; freed in close()
+  cl_command_queue rtQueue1_;  // second extra queue; freed in close()
+  cl_kernel kernel2_;          // extra kernel; freed in close()
+  unsigned int testID_;
+  bool failed_;
+  cl_uint cu_;      // printed by run() as the "RT0" CU count
+  cl_uint maxCUs_;  // run() prints maxCUs_ - cu_ for the other two queues
+  cl_uint rtCUs_;
+};
+
+#endif  // _OCL_RT_QUEUE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp
new file mode 100644
index 0000000000..02dc2be4ca
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp
@@ -0,0 +1,372 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLReadWriteImage.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sstream>
+#ifdef ATI_OS_LINUX
+#include <sys/sysinfo.h>
+#include <unistd.h>
+#endif
+
+#include "CL/cl.h"
+
+const static size_t imageSize = 4;    // image edge length in pixels
+const static size_t MaxSubTests = 4;  // number of subtests (cases 0-3 in run())
+// OpenCL C: each work-item samples one pixel and stores it as uchar4 in dst.
+static const char *rgba8888_kernel_read =
+    "\n"
+    "__kernel void read_rgba8888(read_only image2d_t srcimg, __global uchar4 "
+    "*dst, sampler_t sampler)\n"
+    "{\n"
+    "    int    tid_x = get_global_id(0);\n"
+    "    int    tid_y = get_global_id(1);\n"
+    "    int    indx = tid_y * get_image_width(srcimg) + tid_x;\n"
+    "    float4 color;\n"
+    "\n"
+    "    color = read_imagef(srcimg, sampler, (int2)(tid_x, tid_y)) * 255.0f;\n"
+    "    dst[indx] = convert_uchar4_rte(color);\n"
+    "\n"
+    "}\n";
+// OpenCL C: each work-item loads 4 bytes from src and writes them as one pixel.
+static const char *rgba8888_kernel_write =
+    "\n"
+    "__kernel void write_rgba8888(__global unsigned char *src, write_only "
+    "image2d_t dstimg)\n"
+    "{\n"
+    "    int            tid_x = get_global_id(0);\n"
+    "    int            tid_y = get_global_id(1);\n"
+    "    int            indx = tid_y * get_image_width(dstimg) + tid_x;\n"
+    "    float4         color;\n"
+    "\n"
+    "    indx *= 4;\n"
+    "    color = (float4)((float)src[indx+0], (float)src[indx+1], "
+    "(float)src[indx+2], (float)src[indx+3]);\n"
+    "    color /= (float4)(255.0f, 255.0f, 255.0f, 255.0f);\n"
+    "    write_imagef(dstimg, (int2)(tid_x, tid_y), color);\n"
+    "\n"
+    "}\n";
+
+// Sets the subtest count and gives every data member a defined initial value.
+OCLReadWriteImage::OCLReadWriteImage()
+    : failed_(false), testID_(0), maxSize_(0), imageWidth(imageSize),
+      imageHeight(imageSize), imageDepth(imageSize), bufferSize(0),
+      sampler(NULL) {
+  _numSubTests = MaxSubTests;
+}
+
+OCLReadWriteImage::~OCLReadWriteImage() {}
+
+// Returns true when output equals inputImageData over 4 * width * height
+// bytes; otherwise prints the first differing byte and returns false.
+bool OCLReadWriteImage::verifyImageData(unsigned char *inputImageData,
+                                        unsigned char *output, size_t width,
+                                        size_t height) {
+  const size_t byteCount = 4 * width * height;
+  unsigned int pos = 0;
+  while (pos < byteCount && output[pos] == inputImageData[pos]) ++pos;
+  if (pos >= byteCount) return true;
+  printf("Verification failed at byte %u in the output image => %x != %x "
+         "[reference]\n",
+         pos, output[pos], inputImageData[pos]);
+  return false;
+}
+// Prepares one image read/write subtest:
+//   test 0     - only the 2D image is created (host write/read path);
+//   test 1     - additionally builds read_rgba8888 and a linear buffer;
+//   test 2, 3  - additionally build write_rgba8888 and a linear buffer.
+// failed_ is set (turning run() into a no-op) when any device lacks image
+// support, and always when compiled with ATI_OS_LINUX defined.
+void OCLReadWriteImage::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  for (size_t i = 0; i < deviceCount_; ++i) {
+    // Default to CL_FALSE so a failed query counts as "no image support".
+    cl_bool imageSupport = CL_FALSE;
+    size_t size = 0;
+    _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT,
+                              sizeof(imageSupport), &imageSupport, &size);
+    if (!imageSupport) {
+      failed_ = true;
+      return;
+    }
+  }
+
+#ifdef ATI_OS_LINUX
+  failed_ = true;
+  return;
+#endif
+  // Subtest 1 reads the image in a kernel; subtests 2 and 3 write it in one.
+  const char **kernelSource = NULL;
+  const char *kernelName = NULL;
+  if (test == 1) {
+    kernelSource = &rgba8888_kernel_read;
+    kernelName = "read_rgba8888";
+  } else if ((test == 2) || (test == 3)) {
+    kernelSource = &rgba8888_kernel_write;
+    kernelName = "write_rgba8888";
+  }
+
+  if (kernelSource != NULL) {
+    program_ = _wrapper->clCreateProgramWithSource(context_, 1, kernelSource,
+                                                   NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+    error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                      NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Zero-filled so an oversized or failed log query prints as empty text.
+      char programLog[1024] = {0};
+      _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                      CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                      programLog, 0);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+    kernel_ = _wrapper->clCreateKernel(program_, kernelName, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  }
+
+  // RGBA with 8-bit normalized channels, i.e. 4 bytes per pixel.
+  cl_image_format imageFormat;
+  imageFormat.image_channel_order = CL_RGBA;
+  imageFormat.image_channel_data_type = CL_UNORM_INT8;
+  bufferSize = imageWidth * imageHeight * 4 * sizeof(unsigned char);
+
+  cl_mem memory = _wrapper->clCreateImage2D(context_, CL_MEM_READ_WRITE,
+                                            &imageFormat, imageWidth,
+                                            imageHeight, 0, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed");
+  buffers_.push_back(memory);  // used by run() as buffers_[0], the image
+
+  if (kernelSource != NULL) {
+    // Kernel subtests also use a linear buffer of the same byte size.
+    memory = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufferSize,
+                                      NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(memory);  // used by run() as buffers_[1]
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}  // empty body; not referenced by open(), run() or close()
+
+// Runs the subtest chosen in open(): moves a fixed 4x4 RGBA8 pattern through
+// the image and reports a failure unless every byte comes back unchanged.
+void OCLReadWriteImage::run(void) {
+  if (failed_) {
+    return;
+  }
+
+  const unsigned int inputImageData[imageSize][imageSize] = {
+      {0xc0752fac, 0x67c3fb43, 0xf215d309, 0xd8465724},
+      {0xc13a8c58, 0xae5727e6, 0x19a55158, 0x9409484d},
+      {0xc5f3d073, 0xc0af4ffe, 0xb1d86352, 0x93931df3},
+      {0xc120a78e, 0x207fb909, 0x97f4ca1f, 0x72cbfea3}};
+
+  // Owns the host read-back buffer so it is freed on every way out of run(),
+  // including any early return hidden inside CHECK_RESULT.
+  struct HostBuffer {
+    unsigned char *data;
+    explicit HostBuffer(size_t bytes) : data((unsigned char *)malloc(bytes)) {}
+    ~HostBuffer() { free(data); }
+  } hostBuffer(bufferSize);
+  unsigned char *outputPtr = hostBuffer.data;
+  CHECK_RESULT((outputPtr == NULL), "malloc() failed");
+
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {imageWidth, imageHeight, 1};
+  size_t threads[2] = {imageWidth, imageHeight};  // one work-item per pixel
+  // NOTE(review): sampler is never released and case 3 replaces program_ /
+  // kernel_ without releasing the originals - confirm who cleans these up.
+  cl_sampler sampler = NULL;
+  bool validation;
+
+  switch (testID_) {
+    case 0:  // ImageWrite (w/ sDMA) and ImageRead (w/ sDMA)
+      error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], buffers_[0],
+                                             true, origin, region, 0, 0,
+                                             inputImageData, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+
+      error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], buffers_[0],
+                                            true, origin, region, 0, 0,
+                                            outputPtr, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+      validation = verifyImageData((unsigned char *)&inputImageData, outputPtr,
+                                   imageWidth, imageHeight);
+      if (validation) {
+        printf("ImageWrite (w/ sDMA)   -> ImageRead (w/ sDMA)   passed!\n");
+      } else {
+        CHECK_RESULT(true,
+                     "ImageWrite (w/ sDMA) -> ImageRead (w/ sDMA) failed!\n");
+      }
+      break;
+    case 1:  // ImageWrite (w/ sDMA) and ImageRead (w/ kernel)
+      error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], buffers_[0],
+                                             true, origin, region, 0, 0,
+                                             inputImageData, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed");
+
+      sampler = _wrapper->clCreateSampler(context_, CL_FALSE,
+                                          CL_ADDRESS_CLAMP_TO_EDGE,
+                                          CL_FILTER_NEAREST, &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler failed");
+
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[0],
+                                        &buffers_[0]);
+      error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof buffers_[1],
+                                         &buffers_[1]);
+      error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof sampler, &sampler);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n");
+
+      error_ =
+          _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                           NULL, threads, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+      error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[1],
+                                             CL_TRUE, 0, bufferSize, outputPtr,
+                                             0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+      validation = verifyImageData((unsigned char *)&inputImageData, outputPtr,
+                                   imageWidth, imageHeight);
+      if (validation) {
+        printf("ImageWrite (w/ sDMA)   -> ImageRead (w/ kernel) passed!\n");
+      } else {
+        CHECK_RESULT(true,
+                     "ImageWrite (w/ sDMA) -> ImageRead (w/ kernel) failed!\n");
+      }
+      break;
+    case 2:  // ImageWrite (w/ kernel) and ImageRead (w/ sDMA)
+      error_ = _wrapper->clEnqueueWriteBuffer(
+          cmdQueues_[_deviceId], buffers_[1], CL_TRUE, 0, bufferSize,
+          inputImageData, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[1],
+                                        &buffers_[1]);
+      error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof buffers_[0],
+                                         &buffers_[0]);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n");
+
+      error_ =
+          _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                           NULL, threads, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+      error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], buffers_[0],
+                                            true, origin, region, 0, 0,
+                                            outputPtr, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");
+
+      validation = verifyImageData((unsigned char *)&inputImageData, outputPtr,
+                                   imageWidth, imageHeight);
+      if (validation) {
+        printf("ImageWrite (w/ kernel) -> ImageRead (w/ sDMA)   passed!\n");
+      } else {
+        CHECK_RESULT(true,
+                     "ImageWrite (w/ kernel) -> ImageRead (w/ sDMA) failed!\n");
+      }
+      break;
+    case 3:  // ImageWrite (w/ kernel) and ImageRead (w/ kernel)
+      error_ = _wrapper->clEnqueueWriteBuffer(
+          cmdQueues_[_deviceId], buffers_[1], CL_TRUE, 0, bufferSize,
+          inputImageData, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[1],
+                                        &buffers_[1]);
+      error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof buffers_[0],
+                                         &buffers_[0]);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n");
+
+      error_ =
+          _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                           NULL, threads, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+      // recreate the program_ to use the read kernel
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, &rgba8888_kernel_read, NULL, &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS),
+                   "clCreateProgramWithSource()  failed");
+
+      error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
+                                        NULL, NULL);
+      if (error_ != CL_SUCCESS) {
+        char programLog[1024] = {0};
+        _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                        CL_PROGRAM_BUILD_LOG, 1024, programLog,
+                                        0);
+        printf("\n%s\n", programLog);
+        fflush(stdout);
+      }
+      CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+      kernel_ = _wrapper->clCreateKernel(program_, "read_rgba8888", &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+      sampler = _wrapper->clCreateSampler(context_, CL_FALSE,
+                                          CL_ADDRESS_CLAMP_TO_EDGE,
+                                          CL_FILTER_NEAREST, &error_);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler failed");
+
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[0],
+                                        &buffers_[0]);
+      error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof buffers_[1],
+                                         &buffers_[1]);
+      error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof sampler, &sampler);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n");
+
+      error_ =
+          _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                           NULL, threads, NULL, 0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+      error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[1],
+                                             CL_TRUE, 0, bufferSize, outputPtr,
+                                             0, NULL, NULL);
+      CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+      validation = verifyImageData((unsigned char *)&inputImageData, outputPtr,
+                                   imageWidth, imageHeight);
+      if (validation) {
+        printf("ImageWrite (w/ kernel) -> ImageRead (w/ kernel) passed!\n");
+      } else {
+        CHECK_RESULT(
+            true, "ImageWrite (w/ kernel) -> ImageRead (w/ kernel) failed!\n");
+      }
+      break;
+  }
+}
+
+unsigned int OCLReadWriteImage::close(void) { return OCLTestImp::close(); }  // forwards to the base class only
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h
new file mode 100644
index 0000000000..c22bc51b93
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_READ_WRITE_IMAGE_H_
+#define _OCL_READ_WRITE_IMAGE_H_
+
+#include "OCLTestImp.h"
+
+class OCLReadWriteImage : public OCLTestImp {  // image write/read round-trip test
+ public:
+  OCLReadWriteImage();
+  virtual ~OCLReadWriteImage();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;          // when true, run() returns without doing anything
+  unsigned int testID_;  // subtest index stored by open(), switched on in run()
+  size_t maxSize_;       // not used by open() or run()
+  size_t imageWidth;     // image width in pixels
+  size_t imageHeight;    // image height in pixels
+  size_t imageDepth;     // set by the constructor; not used by open() or run()
+  size_t bufferSize;     // byte size of the image data, computed in open()
+  cl_sampler sampler;    // run() works with a local of the same name instead
+  bool verifyImageData(unsigned char* inputImageData, unsigned char* output,
+                       size_t width, size_t height);  // byte-wise comparison
+};
+
+#endif  // _OCL_READ_WRITE_IMAGE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp
new file mode 100644
index 0000000000..f0081727cc
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp
@@ -0,0 +1,515 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLSDI.h"
+
+#include "Timer.h"
+#define NUM_TESTS 6
+
+#include <cmath>
+
+typedef struct _threadInfo {  // argument block handed to ThreadMain
+  int threadID_;     // forwarded to OCLSDI::threadEntry()
+  OCLSDI* testObj_;  // test object whose threadEntry() is invoked
+} ThreadInfo;
+const char* kernel_str_ =  // OpenCL C: test_kernel stores id + 2 into A[id]
+    "__kernel void test_kernel(global unsigned int * A) \
+			   { \
+					int id = get_global_id(0);  \
+                    A[id] = id + 2;\
+			   } ";
+const char* testNames[NUM_TESTS] = {  // indexed by _openTest in OCLSDI::run()
+    "WriteBuffer", "CopyBuffer",      "NDRangeKernel",
+    "MapBuffer",   "WriteBufferRect", "CopyImageToBuffer",
+};
+
+// Thread trampoline: runs OCLSDI::threadEntry() for the ThreadInfo in data.
+void* ThreadMain(void* data) {
+  ThreadInfo* info = static_cast<ThreadInfo*>(data);
+  if (info != NULL) {
+    info->testObj_->threadEntry(info->threadID_);
+  }
+  return NULL;
+}
+
+OCLSDI::OCLSDI() {
+  // Every test runs twice: with two different GPUs installed, each one has
+  // to be exercised both as sender and as receiver.
+  _numSubTests = NUM_TESTS * 2;
+}
+
+OCLSDI::~OCLSDI() {}
+
+// Sets up one SDI subtest between two GPUs of the first platform.
+//
+// devices_[0] gets a CL_MEM_BUS_ADDRESSABLE_AMD buffer whose bus address is
+// returned in busAddr_; devices_[1] wraps that address in a
+// CL_MEM_EXTERNAL_PHYSICAL_AMD buffer and also holds srcBuff_, a copy of
+// inputArr_. Subtests >= NUM_TESTS run with the two devices swapped. If fewer
+// than two GPUs exist, or one lacks cl_amd_bus_addressable_memory,
+// silentFailure is set and run() does nothing.
+void OCLSDI::open(unsigned int test, char* units, double& conversion,
+                  unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  program_ = 0;
+  kernel_ = 0;
+  srcBuff_ = 0;
+  _openTest = test % NUM_TESTS;
+  bufSize_ = 0x10000;
+  error_ = 0;
+  markerValue_ = 0x12345;
+  inputArr_ = 0;
+  outputArr_ = 0;
+  success_ = true;
+  extPhysicalBuff_ = 0;
+  silentFailure = false;
+  busAddressableBuff_ = 0;
+  devices_[0] = devices_[1] = 0;
+  contexts_[0] = contexts_[1] = 0;
+  cmd_queues_[0] = cmd_queues_[1] = 0;
+  image_ = 0;
+
+  // Host reference data: inputArr_[i] = i + 1, outputArr_ cleared.
+  inputArr_ = (cl_uint*)malloc(bufSize_);
+  outputArr_ = (cl_uint*)malloc(bufSize_);
+  CHECK_RESULT(((inputArr_ == 0) || (outputArr_ == 0)), "malloc failed");
+  for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_uint)); ++i) {
+    inputArr_[i] = i + 1;
+    outputArr_[i] = 0;
+  }
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(numPlatforms == 0, "clGetPlatformIDs failed");
+  error_ = _wrapper->clGetPlatformIDs(1, &platform, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  error_ = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
+                                    &num_devices);
+  if (num_devices < 2) {
+    printf("\nSilent Failure: Two GPUs are required to run OCLSdi test\n");
+    silentFailure = true;
+    return;
+  }
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 2, devices_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  if (test >= NUM_TESTS) {
+    cl_device_id temp = devices_[0];
+    devices_[0] = devices_[1];
+    devices_[1] = temp;
+  }
+
+  // Both devices must expose cl_amd_bus_addressable_memory. The list is read
+  // into a std::string so nothing leaks if CHECK_RESULT returns early.
+  for (int gpu = 0; gpu < 2; ++gpu) {
+    size_t param_size = 0;
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_EXTENSIONS, 0,
+                                       0, &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    // One spare byte keeps the text NUL-terminated and the buffer non-empty.
+    std::string extensions(param_size + 1, '\0');
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_EXTENSIONS,
+                                       param_size, &extensions[0], 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    if (strstr(extensions.c_str(), "cl_amd_bus_addressable_memory") == 0) {
+      printf(
+          "\nSilent Failure: cl_amd_bus_addressable_memory extension is not "
+          "enabled on GPU %d\n",
+          gpu);
+      silentFailure = true;
+      return;
+    }
+  }
+
+  // deviceNames_ becomes " [<devices_[1] name>-><devices_[0] name>]".
+  deviceNames_ = " [";
+  for (int gpu = 1; gpu >= 0; --gpu) {
+    size_t param_size = 0;
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_NAME, 0, 0,
+                                       &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    std::string name(param_size + 1, '\0');
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_NAME,
+                                       param_size, &name[0], 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    if (gpu == 0) {
+      deviceNames_ = deviceNames_ + "->";
+    }
+    deviceNames_ = deviceNames_ + name.c_str();
+  }
+  deviceNames_ = deviceNames_ + "]";
+
+  // One context and one command queue (default properties) per device.
+  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)platform, 0};
+  contexts_[0] =
+      _wrapper->clCreateContext(props, 1, &devices_[0], 0, 0, &error_);
+  CHECK_RESULT(contexts_[0] == 0, "clCreateContext failed");
+  contexts_[1] =
+      _wrapper->clCreateContext(props, 1, &devices_[1], 0, 0, &error_);
+  CHECK_RESULT(contexts_[1] == 0, "clCreateContext failed");
+  cmd_queues_[0] =
+      _wrapper->clCreateCommandQueue(contexts_[0], devices_[0], 0, NULL);
+  CHECK_RESULT(cmd_queues_[0] == 0, "clCreateCommandQueue failed");
+  cmd_queues_[1] =
+      _wrapper->clCreateCommandQueue(contexts_[1], devices_[1], 0, NULL);
+  CHECK_RESULT(cmd_queues_[1] == 0, "clCreateCommandQueue failed");
+  // Device 0: bus-addressable buffer; making it resident yields busAddr_.
+  busAddressableBuff_ = _wrapper->clCreateBuffer(
+      contexts_[0], CL_MEM_BUS_ADDRESSABLE_AMD, bufSize_, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  error_ = _wrapper->clEnqueueMakeBuffersResidentAMD(
+      cmd_queues_[0], 1, &busAddressableBuff_, true, &busAddr_, 0, 0, 0);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clEnqueueMakeBuffersResidentAMD failed");
+  // Device 1: external-physical buffer created on top of busAddr_.
+  extPhysicalBuff_ = _wrapper->clCreateBuffer(
+      contexts_[1], CL_MEM_EXTERNAL_PHYSICAL_AMD, bufSize_, &busAddr_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_,
+                                             0, 0, 0, 0, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  // Transfer source in the device 1 context, pre-filled from inputArr_.
+  srcBuff_ = _wrapper->clCreateBuffer(contexts_[1],
+                                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                      bufSize_, inputArr_, &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clCreateBuffer failed");
+  error_ = _wrapper->clEnqueueMigrateMemObjects(cmd_queues_[1], 1,
+                                                &extPhysicalBuff_, 0, 0, 0, 0);
+  CHECK_RESULT(error_, "clEnqueueMigrateMemObjects failed");
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  error_ = _wrapper->clEnqueueMigrateMemObjects(cmd_queues_[1], 1, &srcBuff_, 0,
+                                                0, 0, 0);
+  CHECK_RESULT(error_, "clEnqueueMigrateMemObjects failed");
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  // NDRangeKernel subtest: test_kernel is bound to extPhysicalBuff_.
+  if (_openTest == 2) {
+    program_ = _wrapper->clCreateProgramWithSource(contexts_[1], 1,
+                                                   &kernel_str_, NULL, &error_);
+    CHECK_RESULT(error_, "clCreateProgramWithSource failed");
+    error_ =
+        _wrapper->clBuildProgram(program_, 1, &devices_[1], NULL, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      char* errorstr;
+      size_t size;
+      _wrapper->clGetProgramBuildInfo(program_, devices_[1],
+                                      CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
+      errorstr = new char[size];
+      _wrapper->clGetProgramBuildInfo(
+          program_, devices_[1], CL_PROGRAM_BUILD_LOG, size, errorstr, &size);
+      printf("\n%s\n", errorstr);
+      delete[] errorstr;
+    }
+    CHECK_RESULT(error_, "clBuildProgram failed");
+
+    kernel_ = _wrapper->clCreateKernel(program_, "test_kernel", &error_);
+    CHECK_RESULT(error_, "clCreateKernel failed");
+    error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                      (void*)&extPhysicalBuff_);
+    CHECK_RESULT(error_, "clSetKernelArg failed");
+  }
+  // CopyImageToBuffer subtest: 1D CL_R / UINT32 image, one texel per cl_uint.
+  if (_openTest == 5) {
+    cl_image_format format = {CL_R, CL_UNSIGNED_INT32};
+    cl_image_desc desc;
+    desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+    desc.image_width = bufSize_ / sizeof(cl_uint);
+    desc.image_height = 0;
+    desc.image_depth = 0;
+    desc.image_array_size = 0;
+    desc.image_row_pitch = 0;
+    desc.image_slice_pitch = 0;
+    desc.num_mip_levels = 0;
+    desc.num_samples = 0;
+    desc.buffer = (cl_mem)NULL;
+    image_ = _wrapper->clCreateImage(contexts_[1], CL_MEM_READ_ONLY, &format,
+                                     &desc, 0, &error_);
+    CHECK_RESULT(error_, "clCreateImage failed");
+  }
+}
+
+// Runs the selected transfer test on two threads (ids 0 and 1), stores
+// "<test name padded to 20 columns><device names>" in testDescString and
+// raises _errorFlag when success_ is false afterwards.
+void OCLSDI::run(void) {
+  if (silentFailure) {
+    return;
+  }
+  ++markerValue_;
+  OCLutil::Thread threads[2];
+  ThreadInfo threadInfo[2] = {{0, this}, {1, this}};
+  threads[0].create(ThreadMain, &threadInfo[0]);
+  threads[1].create(ThreadMain, &threadInfo[1]);
+  threads[0].join();
+  threads[1].join();
+  // Same text the old sprintf("%-20s%s") produced, without a malloc'd buffer.
+  std::string description(testNames[_openTest]);
+  if (description.size() < 20) description.resize(20, ' ');
+  testDescString = description + deviceNames_.c_str();
+  if (!success_) {
+    _errorFlag = true;
+    _crcword += 1;
+  }
+}
+
+/**
+ * Per-thread entry point: forwards to the sub-test selected by _openTest.
+ *
+ * @param threadID  identifier handed through to the sub-test (run() starts
+ *                  threads with IDs 0 and 1).
+ *
+ * Returns without doing anything when silentFailure is set or when
+ * _openTest does not name one of the six sub-tests.
+ */
+void OCLSDI::threadEntry(int threadID) {
+  if (silentFailure) {
+    return;
+  }
+  typedef void (OCLSDI::*SubTest)(int);
+  // Indexed by _openTest.
+  const SubTest subTests[] = {
+      &OCLSDI::testEnqueueWriteBuffer,     &OCLSDI::testEnqueueCopyBuffer,
+      &OCLSDI::testEnqueueNDRangeKernel,   &OCLSDI::testEnqueueMapBuffer,
+      &OCLSDI::testEnqueueWriteBufferRect, &OCLSDI::testEnqueueCopyImageToBuffer,
+  };
+  const size_t numSubTests = sizeof(subTests) / sizeof(subTests[0]);
+  if (_openTest < numSubTests) {
+    (this->*subTests[_openTest])(threadID);
+  }
+}
+
+/**
+ * Releases every OpenCL object and host buffer owned by the test.
+ *
+ * Handles that are null are skipped. A failing release is reported through
+ * CHECK_RESULT_NO_RETURN and the remaining objects are still released.
+ *
+ * @return the accumulated _crcword.
+ */
+unsigned int OCLSDI::close(void) {
+  // Buffers first, in the same order as they are declared for the test.
+  cl_mem buffers[] = {srcBuff_, extPhysicalBuff_, busAddressableBuff_};
+  for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); ++i) {
+    if (buffers[i] != NULL) {
+      error_ = _wrapper->clReleaseMemObject(buffers[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject failed");
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (cmd_queues_[i] != NULL) {
+      error_ = _wrapper->clReleaseCommandQueue(cmd_queues_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseCommandQueue failed");
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (contexts_[i] != NULL) {
+      error_ = _wrapper->clReleaseContext(contexts_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    }
+  }
+  if (program_ != NULL) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (kernel_ != NULL) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (image_ != NULL) {
+    error_ = _wrapper->clReleaseMemObject(image_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed");
+  }
+  // free() accepts a null pointer, so no guard is needed.
+  free(inputArr_);
+  free(outputArr_);
+  return _crcword;
+}
+
+/**
+ * Reader side shared by several sub-tests.
+ *
+ * Clears outputArr_, enqueues a wait for markerValue_ on busAddressableBuff_,
+ * reads the buffer back with a blocking read on cmd_queues_[0] and stores in
+ * success_ whether the bytes match inputArr_.
+ */
+void OCLSDI::readAndVerifyResult() {
+  cl_command_queue readQueue = cmd_queues_[0];
+
+  // Start from zeros so stale data cannot satisfy the comparison.
+  memset(outputArr_, 0, bufSize_);
+
+  error_ = _wrapper->clEnqueueWaitSignalAMD(readQueue, busAddressableBuff_,
+                                            markerValue_, 0, 0, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed");
+
+  error_ = _wrapper->clEnqueueReadBuffer(readQueue, busAddressableBuff_,
+                                         CL_TRUE, 0, bufSize_, outputArr_, 0, 0,
+                                         NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueReadBuffer failed");
+
+  const int difference = memcmp(inputArr_, outputArr_, bufSize_);
+  success_ = (difference == 0);
+}
+
+/**
+ * Sub-test: fill image_ from the host, copy it into extPhysicalBuff_ and
+ * signal the reader.
+ *
+ * Thread 0 writes inputArr_ (every byte set to _openTest + 1) into the image,
+ * copies the image into extPhysicalBuff_ and writes markerValue_ as the
+ * signal, all on cmd_queues_[1]. Any other thread runs readAndVerifyResult().
+ *
+ * @param threadID  0 for the writer, anything else for the reader.
+ */
+void OCLSDI::testEnqueueCopyImageToBuffer(int threadID) {
+  if (threadID == 0) {
+    size_t origin[3] = {0, 0, 0};
+    size_t region[3] = {bufSize_ / sizeof(cl_uint), 1, 1};
+    memset(inputArr_, (_openTest + 1), bufSize_);
+    error_ =
+        _wrapper->clEnqueueWriteImage(cmd_queues_[1], image_, CL_TRUE, origin,
+                                      region, 0, 0, inputArr_, 0, 0, 0);
+    CHECK_RESULT(error_, "clEnqueueWriteImage failed");
+    // The intermediate clFinish results used to be ignored; a failure here
+    // is now reported like every other call in this sequence.
+    error_ = _wrapper->clFinish(cmd_queues_[1]);
+    CHECK_RESULT(error_, "clFinish failed");
+    error_ = _wrapper->clEnqueueCopyImageToBuffer(
+        cmd_queues_[1], image_, extPhysicalBuff_, origin, region, 0, 0, 0, 0);
+    CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queues_[1]);
+    CHECK_RESULT(error_, "clFinish failed");
+    error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_,
+                                               markerValue_, 0, 0, 0, 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+    error_ = _wrapper->clFinish(cmd_queues_[1]);
+    CHECK_RESULT(error_, "clFinish failed");
+  } else {
+    readAndVerifyResult();
+  }
+}
+
+/**
+ * Sub-test: rectangular write into extPhysicalBuff_, rectangular read back
+ * from busAddressableBuff_.
+ *
+ * The rectangle is width x width bytes with a row pitch of width, starting at
+ * the origin, so it occupies the first width * width bytes of the buffer.
+ * Only that region is transferred, so only that region is compared; when
+ * bufSize_ is a perfect square this is the whole buffer.
+ *
+ * @param threadID  0 for the writer (cmd_queues_[1]), anything else for the
+ *                  reader (cmd_queues_[0]).
+ */
+void OCLSDI::testEnqueueWriteBufferRect(int threadID) {
+  // double represents every unsigned int exactly, so the root of a perfect
+  // square is exact.
+  size_t width = (size_t)sqrt((double)bufSize_);
+  const size_t rectBytes = width * width;
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  if (threadID == 0) {
+    memset(inputArr_, (_openTest + 1), bufSize_);
+    error_ = _wrapper->clEnqueueWriteBufferRect(
+        cmd_queues_[1], extPhysicalBuff_, CL_TRUE, bufOrigin, hostOrigin,
+        region, width, 0, width, 0, inputArr_, 0, 0, 0);
+    CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
+    error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_,
+                                               markerValue_, 0, 0, 0, 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+    error_ = _wrapper->clFinish(cmd_queues_[1]);
+    CHECK_RESULT(error_, "clFinish failed");
+  } else {
+    memset(outputArr_, 0, bufSize_);
+    error_ = _wrapper->clEnqueueWaitSignalAMD(
+        cmd_queues_[0], busAddressableBuff_, markerValue_, 0, 0, 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed");
+    error_ = _wrapper->clEnqueueReadBufferRect(
+        cmd_queues_[0], busAddressableBuff_, CL_TRUE, bufOrigin, hostOrigin,
+        region, width, 0, width, 0, outputArr_, 0, 0, 0);
+    CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
+    // Bytes past the rectangle were never transferred and stay zero in
+    // outputArr_, so they must not take part in the comparison.
+    success_ = (memcmp(inputArr_, outputArr_, rectBytes) == 0);
+  }
+}
+
+/**
+ * Sub-test: write extPhysicalBuff_ from the host, verify by mapping
+ * busAddressableBuff_ on the reader side.
+ *
+ * @param threadID  0 for the writer (cmd_queues_[1]), anything else for the
+ *                  reader (cmd_queues_[0]).
+ */
+void OCLSDI::testEnqueueMapBuffer(int threadID) {
+  if (threadID != 0) {
+    // Reader: wait for the marker, map the buffer for reading and compare.
+    cl_command_queue readQueue = cmd_queues_[0];
+    error_ = _wrapper->clEnqueueWaitSignalAMD(readQueue, busAddressableBuff_,
+                                              markerValue_, 0, 0, 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed");
+    void* mapped = _wrapper->clEnqueueMapBuffer(readQueue, busAddressableBuff_,
+                                                CL_TRUE, CL_MAP_READ, 0,
+                                                bufSize_, 0, 0, 0, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    const int difference = memcmp(inputArr_, mapped, bufSize_);
+    success_ = (difference == 0);
+    error_ = _wrapper->clEnqueueUnmapMemObject(readQueue, busAddressableBuff_,
+                                               mapped, 0, 0, 0);
+    CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+    error_ = _wrapper->clFinish(readQueue);
+    CHECK_RESULT(error_, "clFinish failed");
+    return;
+  }
+
+  // Writer: fill the buffer with a blocking write, then signal the marker.
+  cl_command_queue writeQueue = cmd_queues_[1];
+  memset(inputArr_, (_openTest + 1), bufSize_);
+  error_ = _wrapper->clEnqueueWriteBuffer(writeQueue, extPhysicalBuff_, CL_TRUE,
+                                          0, bufSize_, inputArr_, 0, 0, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed");
+  error_ = _wrapper->clEnqueueWriteSignalAMD(writeQueue, extPhysicalBuff_,
+                                             markerValue_, 0, 0, 0, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+  error_ = _wrapper->clFinish(writeQueue);
+  CHECK_RESULT(error_, "clFinish failed");
+}
+
+/**
+ * Sub-test: run kernel_ on cmd_queues_[1], then verify on the reader side
+ * that element i of the buffer holds i + 2.
+ *
+ * @param threadID  0 for the thread that launches the kernel and signals the
+ *                  marker, anything else for the reader on cmd_queues_[0].
+ */
+void OCLSDI::testEnqueueNDRangeKernel(int threadID) {
+  if (threadID == 0) {
+    // One work-item per cl_uint element of the buffer.
+    size_t global_work_size = bufSize_ / sizeof(cl_uint);
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queues_[1], kernel_, 1, NULL,
+                                              &global_work_size, NULL, 0, NULL,
+                                              NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    error_ = _wrapper->clFinish(cmd_queues_[1]);
+    CHECK_RESULT(error_, "clFinish failed");
+    error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_,
+                                               markerValue_, 0, 0, 0, 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+    error_ = _wrapper->clFinish(cmd_queues_[1]);
+    CHECK_RESULT(error_, "clFinish failed");
+  } else {
+    memset(outputArr_, 0, bufSize_);
+    error_ = _wrapper->clEnqueueWaitSignalAMD(
+        cmd_queues_[0], busAddressableBuff_, markerValue_, 0, 0, 0);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed");
+    error_ = _wrapper->clEnqueueReadBuffer(cmd_queues_[0], busAddressableBuff_,
+                                           CL_TRUE, 0, bufSize_, outputArr_, 0,
+                                           0, NULL);
+    // The message used to name clEnqueueWriteBuffer although the failing
+    // call is the read above.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueReadBuffer failed");
+    success_ = true;
+    for (cl_uint i = 0; i < bufSize_ / sizeof(cl_uint); ++i) {
+      success_ &= (outputArr_[i] == i + 2);
+    }
+  }
+}
+
+/**
+ * Sub-test: stage data in srcBuff_, copy it into extPhysicalBuff_ and signal
+ * the reader.
+ *
+ * @param threadID  0 for the writer (cmd_queues_[1]); any other thread runs
+ *                  readAndVerifyResult().
+ */
+void OCLSDI::testEnqueueCopyBuffer(int threadID) {
+  if (threadID != 0) {
+    readAndVerifyResult();
+    return;
+  }
+
+  cl_command_queue writeQueue = cmd_queues_[1];
+  memset(inputArr_, (_openTest + 1), bufSize_);
+
+  // Blocking upload into the staging buffer.
+  error_ = _wrapper->clEnqueueWriteBuffer(writeQueue, srcBuff_, CL_TRUE, 0,
+                                          bufSize_, inputArr_, 0, 0, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed");
+
+  error_ = _wrapper->clEnqueueCopyBuffer(
+      writeQueue, srcBuff_, extPhysicalBuff_, 0, 0, bufSize_, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+
+  error_ = _wrapper->clEnqueueWriteSignalAMD(writeQueue, extPhysicalBuff_,
+                                             markerValue_, 0, 0, 0, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+
+  error_ = _wrapper->clFinish(writeQueue);
+  CHECK_RESULT(error_, "clFinish failed");
+}
+
+/**
+ * Sub-test: write extPhysicalBuff_ directly from the host and signal the
+ * reader.
+ *
+ * @param threadID  0 for the writer (cmd_queues_[1]); any other thread runs
+ *                  readAndVerifyResult().
+ */
+void OCLSDI::testEnqueueWriteBuffer(int threadID) {
+  if (threadID != 0) {
+    readAndVerifyResult();
+    return;
+  }
+
+  cl_command_queue writeQueue = cmd_queues_[1];
+  memset(inputArr_, (_openTest + 1), bufSize_);
+
+  error_ = _wrapper->clEnqueueWriteBuffer(writeQueue, extPhysicalBuff_, CL_TRUE,
+                                          0, bufSize_, inputArr_, 0, 0, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed");
+
+  error_ = _wrapper->clEnqueueWriteSignalAMD(writeQueue, extPhysicalBuff_,
+                                             markerValue_, 0, 0, 0, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed");
+
+  error_ = _wrapper->clFinish(writeQueue);
+  CHECK_RESULT(error_, "clFinish failed");
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h
new file mode 100644
index 0000000000..cf19d2d014
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_OCLSDI_H_
+#define _OCL_OCLSDI_H_
+#include <string>
+
+#include "OCLTestImp.h"
+
+// Two-threaded test: in each sub-test thread 0 fills extPhysicalBuff_ through
+// cmd_queues_[1] and writes a marker with clEnqueueWriteSignalAMD, while the
+// other thread waits for that marker on busAddressableBuff_ through
+// cmd_queues_[0] and verifies the data it reads back.
+class OCLSDI : public OCLTestImp {
+ public:
+  OCLSDI();
+  virtual ~OCLSDI();
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Starts the two threads, joins them and records the result.
+  virtual void run(void);
+  // Releases all OpenCL objects and host arrays; returns _crcword.
+  virtual unsigned int close(void);
+  // Body executed by each thread; dispatches on _openTest.
+  void threadEntry(int threadID);
+
+ private:
+  // One method per sub-test. threadID 0 is the writer, any other value the
+  // reader.
+  void testEnqueueWriteBuffer(int threadID);
+  void testEnqueueCopyBuffer(int threadID);
+  void testEnqueueNDRangeKernel(int threadID);
+  void testEnqueueMapBuffer(int threadID);
+  void testEnqueueWriteBufferRect(int threadID);
+  void testEnqueueCopyImageToBuffer(int threadID);
+  // Reader side shared by the write, copy and image sub-tests.
+  void readAndVerifyResult();
+
+  // When set, run() and threadEntry() return without doing anything.
+  bool silentFailure;
+  // Index 1 is used by the writer side (program, image, extPhysicalBuff_),
+  // index 0 by the reader side (busAddressableBuff_).
+  cl_context contexts_[2];
+  cl_device_id devices_[2];
+  cl_command_queue cmd_queues_[2];
+  // Buffer the writer fills and signals on.
+  cl_mem extPhysicalBuff_;
+  // Buffer the reader waits on and reads back.
+  cl_mem busAddressableBuff_;
+  // Status of the most recent OpenCL call.
+  // NOTE(review): written by both threads without synchronization.
+  cl_int error_;
+  // NOTE(review): presumably the bus address of busAddressableBuff_ obtained
+  // in open() -- confirm there.
+  cl_bus_address_amd busAddr_;
+  // Host reference data written by the writer thread.
+  cl_uint* inputArr_;
+  // Host buffer the reader thread reads results into.
+  cl_uint* outputArr_;
+  // Size in bytes of the host arrays and of each transfer.
+  unsigned int bufSize_;
+  // Outcome of the reader's verification; run() flags an error when false.
+  bool success_;
+  // Incremented once per run(); the value written and waited for as signal.
+  cl_uint markerValue_;
+  // Staging buffer for the copy-buffer sub-test.
+  cl_mem srcBuff_;
+  // Program and kernel ("test_kernel") for the NDRange sub-test.
+  cl_program program_;
+  cl_kernel kernel_;
+  // 1D image used by the copy-image-to-buffer sub-test.
+  cl_mem image_;
+  // Appended to the test name to form the description string in run().
+  std::string deviceNames_;
+};
+#endif  // _OCL_OCLSDI_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp
new file mode 100644
index 0000000000..cf78a51517
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp
@@ -0,0 +1,612 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLSVM.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cstdlib>
+#ifdef _WIN32
+#include <intrin.h>
+#include <windows.h>
+#endif
+#include <iostream>
+
+#define NUM_SIZES 6
+
+#define OCL_CHECK(error)                                                 \
+  if (error != CL_SUCCESS) {                                             \
+    fprintf(stderr, "OpenCL API invocation failed at %s:%d\n", __FILE__, \
+            __LINE__);                                                   \
+    exit(-1);                                                            \
+  }
+
+#define STR(__macro__) #__macro__
+
+#ifdef _WIN32
+// Returns the amount of physical memory reported by GlobalMemoryStatusEx, in
+// bytes, or 0 when the query fails (previously the uninitialized structure
+// was read in that case).
+size_t getTotalSystemMemory() {
+  MEMORYSTATUSEX status;
+  status.dwLength = sizeof(status);
+  if (!GlobalMemoryStatusEx(&status)) {
+    return 0;
+  }
+  return static_cast<size_t>(status.ullTotalPhys);
+}
+#endif
+
+// Returns the number of elements of a built-in array; the count is deduced
+// from the array type at compile time.
+template <typename Element, unsigned Count>
+static unsigned countOf(const Element (&)[Count]) {
+  return Count;
+}
+
+// OpenCL C source of the kernel for each sub-test. open() builds
+// sources[_openTest], and the constructor derives the number of sub-tests
+// from the length of this array. The "used by" notes below follow the order
+// of the run*() methods in this file -- confirm against run().
+const static char* sources[] = {
+    // [0] Every work-item stores 0xDEADBEEF at its global id
+    //     (appears to be used by runFineGrainedBuffer).
+    STR(__kernel void test(__global int* ptr) {
+      ptr[get_global_id(0)] = 0xDEADBEEF;
+    }),
+    // [1] Same as [0] plus 0xDEADF00D into a second buffer
+    //     (appears to be used by runFineGrainedSystem).
+    STR(__kernel void test(__global int* ptr, __global int* ptr2) {
+      ptr[get_global_id(0)] = 0xDEADBEEF;
+      ptr2[get_global_id(0)] = 0xDEADF00D;
+    }),
+    // [2] Every work-item stores 0xBAADF00D at a stride of 1024 longs
+    //     (appears to be used by runFineGrainedSystemLargeAllocations).
+    STR(__kernel void test(__global long* ptr) {
+      ptr[get_global_id(0) * 1024] = 0xBAADF00D;
+    }),
+    // [3] Walks a linked list whose node is a value followed by the address
+    //     of the next node, overwriting each value with 0xDEADBEEF
+    //     (appears to be used by runLinkedListSearchUsingFineGrainedSystem).
+    STR(__kernel void test(__global ulong* ptr) {
+      while (ptr) {
+        *ptr = 0xDEADBEEF;
+        ptr = *((__global ulong*)(ptr + 1));
+      }
+    }),
+    // [4] Atomically increments *ptr numIterations times
+    //     (appears to be used by runPlatformAtomics).
+    STR(__kernel void test(__global volatile int* ptr, int numIterations) {
+      for (int i = 0; i < numIterations; i++) {
+        // This should be:
+        // atomic_fetch_add_explicit(ptr, 1, memory_order_relaxed,
+        //                           memory_scope_all_svm_devices);
+        // But using device atomics is mapped to the same ISA and compiles
+        // in OpenCL 1.2
+        atomic_inc(ptr);
+      }
+    }),
+    // [5] Empty kernel (appears to be used by runEnqueueOperations).
+    STR(__kernel void test(){
+        // dummy
+    }),
+    // [6] Empty kernel with a mix of value and pointer parameters
+    //     (appears to be used by runSvmArgumentsAreRecognized).
+    STR(__kernel void test(int8 arg0, __global int* arg1, int arg2,
+                           __global int* arg3, __global float* arg4){
+        // dummy
+    }),
+    // [7] Each work-item updates its own element `to` times
+    //     (appears to be used by runSvmCommandsExecutedInOrder).
+    STR(__kernel void test(__global int* ptr, int to) {
+      // dummy kernel that takes a long time to complete
+      for (int i = 0; i < to; ++i) {
+        // avoid compiler optimizations
+        if (ptr[get_global_id(0)] != 17) {
+          ptr[get_global_id(0)]++;
+        } else {
+          ptr[get_global_id(0)] += 2;
+        }
+      }
+    }),
+    // [8] Empty kernel (appears to be used by runIdentifySvmBuffers).
+    STR(__kernel void test(){
+        // dummy
+    })};
+
+// One sub-test per kernel source.
+OCLSVM::OCLSVM() {
+  _numSubTests = sizeof(sources) / sizeof(sources[0]);
+}
+
+// Nothing to release here.
+OCLSVM::~OCLSVM() {}
+
+/**
+ * Opens sub-test `test`: runs the base-class open, then builds the kernel
+ * named "test" from sources[test] with -cl-std=CL2.0.
+ *
+ * Returns early, without a program or kernel, when the device reports no SVM
+ * support.
+ *
+ * The program is built for devices_[_deviceId], the same device used for the
+ * SVM capability check here and for the enqueues in the run*() methods.
+ * NOTE(review): assumes OCLTestImp::open() stores deviceId in _deviceId --
+ * confirm in the base class.
+ */
+void OCLSVM::open(unsigned int test, char* units, double& conversion,
+                  unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_ERROR(error_, "Error opening test");
+  _openTest = test;
+
+  if (!isOpenClSvmAvailable(devices_[_deviceId])) {
+    printf("Device does not support any SVM features, skipping...\n");
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(
+      context_, 1, sources + _openTest, NULL, &error_);
+  CHECK_ERROR(error_, "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  CHECK_ERROR(error_, "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "test", &error_);
+  CHECK_ERROR(error_, "clCreateKernel() failed");
+}
+
+#ifndef CL_VERSION_2_0
+// Without CL_VERSION_2_0 the SVM entry points are not declared, so every
+// sub-test body is replaced by an empty stub; this keeps the tests compiling
+// against OpenCL <= 1.2 headers.
+void OCLSVM::runFineGrainedBuffer() {}
+void OCLSVM::runFineGrainedSystem() {}
+void OCLSVM::runFineGrainedSystemLargeAllocations() {}
+void OCLSVM::runLinkedListSearchUsingFineGrainedSystem() {}
+void OCLSVM::runPlatformAtomics() {}
+void OCLSVM::runEnqueueOperations() {}
+void OCLSVM::runSvmArgumentsAreRecognized() {}
+void OCLSVM::runSvmCommandsExecutedInOrder() {}
+void OCLSVM::runIdentifySvmBuffers() {}
+#else
+
+/**
+ * Runs the kernel on a fine-grained SVM buffer of 256 ints and checks, from
+ * the host and without mapping, that every element holds 0xDEADBEEF.
+ * Skipped when the device lacks CL_DEVICE_SVM_FINE_GRAIN_BUFFER.
+ */
+void OCLSVM::runFineGrainedBuffer() {
+  if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
+    printf(
+        "Device does not support fined-grained buffer sharing, skipping "
+        "test...\n");
+    return;
+  }
+  const size_t numElements = 256;
+  int* ptr = (int*)clSVMAlloc(context_,
+                              CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+                              numElements * sizeof(int), 0);
+  CHECK_RESULT(!ptr, "clSVMAlloc() failed");
+
+  error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  size_t gws[1] = {numElements};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed");
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "Queue::finish() failed");
+
+  size_t matchingElements = std::count(ptr, ptr + numElements, (int)0xDEADBEEF);
+  // Release the allocation before reporting: CHECK_RESULT leaves the
+  // function on failure, which used to leak the buffer.
+  clSVMFree(context_, ptr);
+  // %zu is the conversion for size_t (%zd is for the signed counterpart).
+  CHECK_RESULT(matchingElements != numElements, "Expected: %zu, found:%zu",
+               numElements, matchingElements);
+}
+
+/**
+ * Passes two ordinary heap arrays (operator new[]) to the kernel as SVM
+ * pointers and checks that the kernel filled them with 0xDEADBEEF and
+ * 0xDEADF00D respectively.
+ * Skipped when the device lacks CL_DEVICE_SVM_FINE_GRAIN_SYSTEM.
+ */
+void OCLSVM::runFineGrainedSystem() {
+  if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) {
+    printf(
+        "Device does not support fined-grained system sharing, skipping "
+        "test...\n");
+    return;
+  }
+
+  const size_t numElements = 256;
+  int* ptr = new int[numElements];
+  int* ptr2 = new int[numElements];
+  error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  error_ = clSetKernelArgSVMPointer(kernel_, 1, ptr2);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  size_t gws[1] = {numElements};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed");
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "Queue::finish() failed");
+
+  size_t matchingElements = std::count(ptr, ptr + numElements, (int)0xDEADBEEF);
+  size_t matchingElements2 =
+      std::count(ptr2, ptr2 + numElements, (int)0xDEADF00D);
+  // Release the arrays before reporting: CHECK_RESULT leaves the function on
+  // failure, which used to leak both of them.
+  delete[] ptr;
+  delete[] ptr2;
+  // %zu is the conversion for size_t (%zd is for the signed counterpart).
+  CHECK_RESULT(matchingElements + matchingElements2 != 2 * numElements,
+               "Expected: %zu, found:%zu", numElements * 2,
+               matchingElements + matchingElements2);
+}
+
+/**
+ * Windows only: hands increasingly large malloc'ed regions to the kernel as
+ * SVM pointers and checks that every 1024th 64-bit element was written.
+ *
+ * Environment overrides read by this test:
+ *   OCLSVM_MALLOC_GB_SIZE  replaces the per-iteration size multiplier.
+ *   OCLSVM_MEMSET_ALLOC    when set, the region is zeroed before use.
+ *
+ * The loop stops at the first size that exceeds total physical memory >> 12.
+ */
+void OCLSVM::runFineGrainedSystemLargeAllocations() {
+#ifdef _WIN32
+  if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) {
+    printf(
+        "Device does not support fined-grained system sharing on Lnx, skipping "
+        "test...\n");
+    return;
+  }
+
+  // Max allowed multiplier for malloc
+  size_t allowedMemSize = getTotalSystemMemory() >> 12;
+
+  size_t numElements = 256;
+
+  char* s = getenv("OCLSVM_MALLOC_GB_SIZE");
+  char* s2 = getenv("OCLSVM_MEMSET_ALLOC");
+
+  for (int j = 1; j <= NUM_SIZES; j++) {
+    numElements = 131072 * j;
+
+    if (s != NULL) numElements = 131072 * atoi(s);
+
+    if (numElements > allowedMemSize) break;
+
+    void* ptr = malloc(numElements * 1024 * sizeof(uint64_t));
+    CHECK_ERROR(ptr == NULL, "malloc failure");
+
+    if (s2 != NULL) memset(ptr, 0, numElements * 1024 * sizeof(uint64_t));
+
+    error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr);
+    CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+    size_t gws[1] = {numElements};
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed");
+
+    error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+    CHECK_ERROR(error_, "Queue::finish() failed");
+
+    uint64_t* ptr64 = reinterpret_cast<uint64_t*>(ptr);
+    // Do a check. The index is size_t so i * 1024 cannot overflow int when
+    // the size is overridden through the environment.
+    for (size_t i = 0; i < numElements; i++) {
+      // As before, only the low 32 bits decide whether an element matches.
+      if ((unsigned int)ptr64[i * 1024] != 0xBAADF00Du) {
+        uint64_t temp = ptr64[i * 1024];
+        // The region comes from malloc, so it must be released with free();
+        // delete[] on it is undefined behavior.
+        free(ptr);
+        // uint64_t needs a 64-bit conversion; %d read only part of it.
+        CHECK_RESULT(temp != 0xBAADF00D, "Found: %llu, Expected:%llu",
+                     (unsigned long long)temp, (unsigned long long)0xBAADF00D);
+      }
+    }
+    free(ptr);
+  }
+#endif
+}
+
+/**
+ * Builds a host-side linked list of Node objects, lets a single work-item
+ * walk it through fine-grained system SVM, and checks that every node value
+ * was overwritten with 0xDEADBEEF.
+ * Skipped when the device lacks CL_DEVICE_SVM_FINE_GRAIN_SYSTEM.
+ */
+void OCLSVM::runLinkedListSearchUsingFineGrainedSystem() {
+  if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) {
+    printf(
+        "Device does not support fined-grained system sharing, skipping "
+        "test...\n");
+    return;
+  }
+
+  uint64_t values[] = {34, 6, 0, 11, 89, 34, 6, 6, 6, 0xDEADBEEF};
+  int numNodes = countOf(values);
+
+  // Each new node is linked in front of the previous head.
+  Node* head = NULL;
+  for (int idx = 0; idx != numNodes; ++idx) {
+    head = new Node(values[idx], head);
+  }
+
+  error_ = clSetKernelArgSVMPointer(kernel_, 0, head);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  size_t gws[1] = {1};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed");
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "Queue::finish() failed");
+
+  // Count the rewritten nodes and free the list in the same pass.
+  int rewritten = 0;
+  for (Node* node = head; node != NULL;) {
+    if (node->value_ == 0xDEADBEEF) {
+      ++rewritten;
+    }
+    Node* next = (Node*)node->next_;
+    delete node;
+    node = next;
+  }
+  CHECK_RESULT(rewritten != numNodes, "Expected: %d, found:%d", numNodes,
+               rewritten);
+}
+
+// Atomically adds one to *loc using the compiler's intrinsic.
+// Note that the return value differs by toolchain: the MSVC intrinsic yields
+// the incremented value, the GCC builtin yields the value before the
+// increment. On any other compiler the process is aborted.
+static int atomicIncrement(volatile int* loc) {
+#if defined(_MSC_VER)
+  return _InterlockedIncrement((volatile long*)loc);
+#elif defined(__GNUC__)
+  return __sync_fetch_and_add(loc, 1);
+#else
+  printf("Atomic increment not supported, aborting...");
+  std::abort();
+  return 0;
+#endif
+}
+
+/**
+ * Increments one int in fine-grained SVM with atomics from the device and
+ * from the host at the same time, numIterations times each, and expects the
+ * final value to be twice numIterations.
+ * Skipped when the device lacks CL_DEVICE_SVM_ATOMICS.
+ */
+void OCLSVM::runPlatformAtomics() {
+  if (!(svmCaps_ & CL_DEVICE_SVM_ATOMICS)) {
+    printf("SVM atomics not supported, skipping test...\n");
+    return;
+  }
+
+  volatile int* value = (volatile int*)clSVMAlloc(
+      context_, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(int),
+      0);
+  CHECK_RESULT(!value, "clSVMAlloc() failed");
+  *value = 0;
+  const int numIterations = 1000000;
+  error_ = clSetKernelArgSVMPointer(kernel_, 0, (const void*)value);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  error_ = clSetKernelArg(kernel_, 1, sizeof(numIterations), &numIterations);
+  CHECK_ERROR(error_, "clSetKernelArg() failed");
+
+  size_t gws[1] = {1};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed");
+
+  // Host increments are issued after the enqueue and before clFinish.
+  for (int i = 0; i < numIterations; i++) {
+    atomicIncrement(value);
+  }
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "Queue::finish() failed");
+
+  const int expected = numIterations * 2;
+  const int observed = *value;
+  // Release the allocation before reporting: CHECK_RESULT leaves the
+  // function on failure, which used to leak it.
+  clSVMFree(context_, (void*)value);
+  CHECK_RESULT(observed != expected, "Expected: %d, found:%d", expected,
+               observed);
+}
+
+/**
+ * Exercises the SVM enqueue entry points on two coarse-grained allocations:
+ * map/fill/unmap the first, copy it into the second with clEnqueueSVMMemcpy,
+ * map the second to verify the copy, then free both with clEnqueueSVMFree.
+ *
+ * Changes from the previous version:
+ *  - the event returned by the verification map gets its own handle and is
+ *    released; before, it overwrote (and leaked) a user event that was
+ *    created for no other purpose;
+ *  - the second buffer is unmapped again before it is freed;
+ *  - the result is reported after the buffers are freed, so a mismatch no
+ *    longer leaks them.
+ */
+void OCLSVM::runEnqueueOperations() {
+  size_t numElements = 32;
+  size_t size = numElements * sizeof(int);
+  int* ptr0 = (int*)clSVMAlloc(context_, 0, size, 0);
+  CHECK_RESULT(!ptr0, "clSVMAlloc() failed");
+  int* ptr1 = (int*)clSVMAlloc(context_, 0, size, 0);
+  if (!ptr1) {
+    clSVMFree(context_, ptr0);
+  }
+  CHECK_RESULT(!ptr1, "clSVMAlloc() failed");
+
+  cl_command_queue queue = cmdQueues_[_deviceId];
+  // coarse-grained buffer semantics: the SVM pointer needs to be mapped
+  // before the host can write to it
+  error_ =
+      clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, ptr0, size, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueSVMMap() failed");
+  std::fill(ptr0, ptr0 + numElements, 1);
+  error_ = clEnqueueSVMUnmap(queue, ptr0, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueSVMUnmap() failed");
+
+  // we copy the 1st buffer into the 2nd buffer
+  error_ = clEnqueueSVMMemcpy(queue, CL_TRUE, ptr1, ptr0, size, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueSVMMemcpy() failed");
+
+  // verification: the 2nd buffer should be identical to the 1st
+  cl_event mapEvent = NULL;
+  error_ = clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, ptr1, size, 0, NULL,
+                           &mapEvent);
+  CHECK_ERROR(error_, "clEnqueueSVMMap() failed");
+
+  error_ = clWaitForEvents(1, &mapEvent);
+  clReleaseEvent(mapEvent);
+  CHECK_ERROR(error_, "clWaitForEvents() failed");
+
+  size_t observed = std::count(ptr1, ptr1 + numElements, 1);
+  size_t expected = numElements;
+
+  error_ = clEnqueueSVMUnmap(queue, ptr1, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueSVMUnmap() failed");
+
+  void* ptrs[2] = {ptr0, ptr1};
+  error_ =
+      clEnqueueSVMFree(queue, countOf(ptrs), ptrs, NULL, NULL, 0, NULL, NULL);
+  CHECK_ERROR(error_, "clEnqueueSVMFree() failed");
+  error_ = clFinish(queue);
+  CHECK_ERROR(error_, "clFinish() failed");
+
+  CHECK_RESULT(observed != expected, "Expected: %zu, found:%zu", expected,
+               observed);
+}
+
+/**
+ * Checks that the runtime tells SVM pointer arguments apart from cl_mem
+ * arguments: a pointer-typed kernel parameter may be bound either way.
+ *
+ * The empty kernel is launched twice. For the first launch parameters 1 and 3
+ * are bound as (null) SVM pointers and parameter 4 as a (null) cl_mem; for
+ * the second launch parameter 1 is rebound as a (null) cl_mem.
+ */
+void OCLSVM::runSvmArgumentsAreRecognized() {
+  cl_int8 arg0;
+  error_ = clSetKernelArg(kernel_, 0, sizeof(arg0), &arg0);
+  CHECK_ERROR(error_, "clSetKernelArg() failed");
+
+  error_ = clSetKernelArgSVMPointer(kernel_, 1, NULL);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  cl_int arg2;
+  error_ = clSetKernelArg(kernel_, 2, sizeof(arg2), &arg2);
+  CHECK_ERROR(error_, "clSetKernelArg() failed");
+
+  error_ = clSetKernelArgSVMPointer(kernel_, 3, NULL);
+  CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed");
+
+  cl_mem arg4 = NULL;
+  error_ = clSetKernelArg(kernel_, 4, sizeof(arg4), &arg4);
+  CHECK_ERROR(error_, "clSetKernelArg() failed");
+
+  size_t gws[1] = {1};
+
+  for (int launch = 0; launch < 2; ++launch) {
+    if (launch == 1) {
+      // second launch: bind parameter 1 to a standard buffer instead of an
+      // SVM pointer
+      cl_mem buffer = NULL;
+      error_ = clSetKernelArg(kernel_, 1, sizeof(buffer), &buffer);
+      CHECK_ERROR(error_, "clSetKernelArg() failed");
+    }
+
+    // run the dummy kernel with the current bindings
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, NULL, 0, NULL, NULL);
+    CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed");
+    error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+    CHECK_ERROR(error_, "Queue::finish() failed");
+  }
+}
+
+/**
+ * Checks that SVM map/unmap commands and a kernel launch on the same queue
+ * take effect in submission order.
+ *
+ * The buffer is zeroed through a map, the kernel then updates every element
+ * numElements times, and a second map reads the result back. Each pass of
+ * the kernel adds 1, except the pass that sees the value 17, which adds 2, so
+ * every element must end up as numElements + 1.
+ */
+void OCLSVM::runSvmCommandsExecutedInOrder() {
+  const int numElements = 100000;
+  size_t size = numElements * sizeof(int);
+  // allocate SVM memory
+  int* data = (int*)clSVMAlloc(context_, CL_MEM_READ_WRITE, size, 0);
+  CHECK_RESULT(!data, "clSVMAlloc failed");
+
+  // map the SVM buffer to host
+  cl_int status = clEnqueueSVMMap(cmdQueues_[_deviceId], CL_TRUE, CL_MAP_WRITE,
+                                  data, size, 0, NULL, NULL);
+  CHECK_ERROR(status, "Error when mapping SVM buffer");
+
+  // fill buffer with 0s
+  std::fill(data, data + numElements, 0);
+
+  // unmap the SVM buffer to host
+  status = clEnqueueSVMUnmap(cmdQueues_[_deviceId], data, 0, NULL, NULL);
+  CHECK_ERROR(status, "Error when unmapping SVM buffer");
+
+  // enqueue kernel
+  status = clSetKernelArgSVMPointer(kernel_, 0, data);
+  CHECK_ERROR(status, "Error when setting kernel argument");
+  status = clSetKernelArg(kernel_, 1, sizeof(int), &numElements);
+  CHECK_ERROR(status, "clSetKernelArg() failed");
+
+  cl_event event;
+  size_t overallSize = (size_t)numElements;
+  status = clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                  &overallSize, NULL, 0, NULL, &event);
+  CHECK_ERROR(status, "Error when enqueuing kernel");
+  // The event is not needed for synchronization; release it so the handle
+  // does not leak.
+  clReleaseEvent(event);
+  // The clFinish status used to be stored in error_ while `status` (still
+  // holding the enqueue result) was the variable being checked.
+  error_ = clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "clFinish()");
+
+  // map the SVM buffer to host
+  status = clEnqueueSVMMap(cmdQueues_[_deviceId], CL_TRUE, CL_MAP_READ, data,
+                           size, 0, NULL, NULL);
+  CHECK_ERROR(status, "Error when mapping SVM buffer");
+
+  bool pass = true;
+  // verify the data. Using descending order might increase the chance of
+  // finding an error since the GPU (when used) might not have finished
+  // updating the data array by the time we do the verification
+  for (int i = numElements - 1; i >= 0; i--) {
+    if (data[i] != numElements + 1) {
+      pass = false;
+      break;
+    }
+  }
+
+  // unmap the SVM buffer to host
+  status = clEnqueueSVMUnmap(cmdQueues_[_deviceId], data, 0, NULL, NULL);
+  CHECK_ERROR(status, "Error when unmapping SVM buffer");
+
+  // free the SVM buffer
+  status = clEnqueueSVMFree(cmdQueues_[_deviceId], 1, (void**)&data, NULL, NULL,
+                            0, NULL, NULL);
+  CHECK_ERROR(status, "Error when freeing the SVM buffer");
+  error_ = clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "clFinish() failed");
+  CHECK_RESULT(!pass, "Wrong result");
+}
+
+// Sub-test 8: CL_MEM_USES_SVM_POINTER must hold only for SVM-backed buffers.
+void OCLSVM::runIdentifySvmBuffers() {
+  size_t size = 1024 * 1024;
+  // dummy allocations make the runtime track several SVM buffers besides ptr
+  void* dummy1 = clSVMAlloc(context_, CL_MEM_READ_WRITE, size * 10, 0);
+  void* ptr = clSVMAlloc(context_, CL_MEM_READ_WRITE, size, 0);
+  void* dummy2 = clSVMAlloc(context_, CL_MEM_READ_WRITE, size * 4, 0);
+  cl_int status;
+  cl_bool usesSVMpointer = CL_FALSE;
+
+  // buffer using the entire SVM region should be identified as such
+  cl_mem buf1 =
+      clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, size, ptr, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  size_t paramSize = 0;
+  status = clGetMemObjectInfo(buf1, CL_MEM_USES_SVM_POINTER, 0, 0, &paramSize);
+  CHECK_ERROR(status, "clGetMemObjectInfo failed");
+  CHECK_RESULT(paramSize != sizeof(cl_bool),
+               "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) "
+               "returned wrong size.");
+
+  status = clGetMemObjectInfo(buf1, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool),
+                              &usesSVMpointer, 0);
+  CHECK_ERROR(status, "clGetMemObjectInfo failed");
+  CHECK_RESULT(usesSVMpointer != CL_TRUE,
+               "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) "
+               "returned CL_FALSE for buffer created from SVM pointer.");
+
+  // Buffer that uses random region within SVM buffers
+  cl_mem buf2 = clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, 256,
+                               (char*)ptr + size - 256, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  status = clGetMemObjectInfo(buf2, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool),
+                              &usesSVMpointer, 0);
+  CHECK_ERROR(status, "clGetMemObjectInfo failed");
+  CHECK_RESULT(usesSVMpointer != CL_TRUE,
+               "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) "
+               "returned CL_FALSE for buffer created from SVM pointer.");
+
+  // for any other pointer the query should return false
+  void* randomPtr = malloc(size);
+  cl_mem buf3 =
+      clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, size, randomPtr, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  status = clGetMemObjectInfo(buf3, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool),
+                              &usesSVMpointer, 0);
+  CHECK_ERROR(status, "clGetMemObjectInfo failed");
+  CHECK_RESULT(usesSVMpointer == CL_TRUE,
+               "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) "
+               "returned CL_TRUE for buffer not created from SVM pointer.");
+
+  clReleaseMemObject(buf3);
+  clReleaseMemObject(buf2);
+  clReleaseMemObject(buf1);
+  free(randomPtr);
+  clSVMFree(context_, dummy2);
+  clSVMFree(context_, ptr);
+  clSVMFree(context_, dummy1);
+}
+#endif
+
+// Returns CL_TRUE when device_id reports coarse-grained buffer SVM support.
+cl_bool OCLSVM::isOpenClSvmAvailable(cl_device_id device_id) {
+#ifdef CL_VERSION_2_0
+  svmCaps_ = 0;  // a failed query must not leave stale capability bits behind
+  error_ = clGetDeviceInfo(device_id, CL_DEVICE_SVM_CAPABILITIES,
+                           sizeof(svmCaps_), &svmCaps_, NULL);
+  CHECK_ERROR_NO_RETURN(error_, "clGetDeviceInfo() failed");
+  if (error_ != CL_SUCCESS) return CL_FALSE;
+  return (svmCaps_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? CL_TRUE : CL_FALSE;
+#else
+  // available headers are <= 1.2, so SVM cannot be queried at all
+  (void)device_id;
+  return CL_FALSE;
+#endif
+}
+
+// Runs the sub-test selected by _openTest (numbered in the order the run*()
+// helpers are declared in OCLSVM.h), or prints a notice and returns when
+// isOpenClSvmAvailable() reports no SVM support for the device.
+void OCLSVM::run() {
+  // nothing to test without SVM support
+  const cl_bool svmAvailable = isOpenClSvmAvailable(devices_[_deviceId]);
+  if (!svmAvailable) {
+    printf("Device does not support any SVM features, skipping...\n");
+    return;
+  }
+
+  switch (_openTest) {
+    case 0: runFineGrainedBuffer(); break;
+    case 1: runFineGrainedSystem(); break;
+    case 2: runFineGrainedSystemLargeAllocations(); break;
+    case 3: runLinkedListSearchUsingFineGrainedSystem(); break;
+    case 4: runPlatformAtomics(); break;
+    case 5: runEnqueueOperations(); break;
+    case 6: runSvmArgumentsAreRecognized(); break;
+    case 7: runSvmCommandsExecutedInOrder(); break;
+    case 8: runIdentifySvmBuffers(); break;
+    default:
+      // any other index has no sub-test and is silently ignored
+      break;
+  }
+}
+
+unsigned int OCLSVM::close(void) { return OCLTestImp::close(); }  // forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h
new file mode 100644
index 0000000000..f861081fed
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_SVM_H_
+#define _OCL_SVM_H_
+
+#include <CL/cl.h>
+
+#include "OCLTestImp.h"
+#include "stdint.h"
+
+class OCLSVM : public OCLTestImp {  // ocltst test exercising OpenCL SVM
+ public:
+  OCLSVM();
+
+  virtual ~OCLSVM();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+
+  virtual void run(void);  // dispatches on _openTest to a run*() helper below
+
+  virtual unsigned int close(void);
+
+ private:
+  void runFineGrainedBuffer();  // sub-test 0
+  void runFineGrainedSystem();  // sub-test 1
+  void runFineGrainedSystemLargeAllocations();  // sub-test 2
+  void runLinkedListSearchUsingFineGrainedSystem();  // sub-test 3
+  void runPlatformAtomics();  // sub-test 4
+  void runEnqueueOperations();  // sub-test 5
+  void runSvmArgumentsAreRecognized();  // sub-test 6
+  void runSvmCommandsExecutedInOrder();  // sub-test 7
+  void runIdentifySvmBuffers();  // sub-test 8
+  cl_bool isOpenClSvmAvailable(cl_device_id device_id);  // gate used by run()
+
+  uint64_t svmCaps_;  // CL_DEVICE_SVM_CAPABILITIES, set by isOpenClSvmAvailable()
+};
+
+struct Node {  // linked-list node; both fields are fixed at 64 bits
+  Node(uint64_t value, Node* next)  // uintptr_t: zero-extend 32-bit pointers
+      : value_(value), next_((uint64_t)(uintptr_t)next) {}
+  uint64_t value_;
+  uint64_t next_;  // address of the next Node, stored as an integer
+};
+
+#endif  // _OCL_SVM_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp
new file mode 100644
index 0000000000..9804633196
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp
@@ -0,0 +1,225 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLSemaphore.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#ifndef CL_DEVICE_MAX_SEMAPHORES_AMD
+#define CL_DEVICE_MAX_SEMAPHORES_AMD 0x1041
+#else
+#error "CL_DEVICE_MAX_SEMAPHORES_AMD is defined somewhere, remove this define!"
+#endif
+#ifndef CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD
+#define CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD 0x1042
+#else
+#error \
+    "CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD is defined somewhere, remove this define!"
+#endif
+#ifndef CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD
+#define CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD 0x1043
+#else
+#error \
+    "CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD is defined somewhere, remove this define!"
+#endif
+
+const static unsigned int MaxSemaphores = 1;  // loop bound in open()/run()
+// OpenCL C source of sema_test; empty unless cl_amd_semaphore is defined
+const static char* strKernel =
+    "#ifdef cl_amd_semaphore\n"
+    "#pragma OPENCL EXTENSION cl_amd_semaphore : enable            \n"
+    "kernel void sema_test(sema_t lock, global int* a, global int* b, int "
+    "value)\n"
+    "  {\n"
+    "    size_t idx = get_global_id(0);\n"
+    "    size_t gdx = get_group_id(0);\n"
+    "    size_t ng = get_num_groups(0);\n"
+    "    size_t ssize = get_max_semaphore_size();\n"
+    "    a[1] = true;\n"
+    "    if (gdx >= ssize) {\n"
+    "      return;\n"
+    "    }\n"
+    "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
+    "    semaphore_init(lock, ng);\n"
+    "    while (a[1]) {\n"
+    "      atom_add(a, b[idx]);\n"
+    "      atom_inc(a + 2);\n"
+    "      if (gdx == (ssize - 1)) {\n"
+    "        semaphore_signal(lock);\n"
+    "        if (a[0] >= value) {\n"
+    "          a[1] = false;\n"
+    "        }\n"
+    "      } else {\n"
+    "        semaphore_wait(lock);\n"
+    "        idx += get_global_size(0);\n"
+    "      }\n"
+    "    }\n"
+    "    semaphore_signal(lock);\n"
+    "  }\n"
+    "#endif\n";
+
+OCLSemaphore::OCLSemaphore() : hasSemaphore(false) {
+  // hasSemaphore is only turned on by open() once the extension is found
+  _numSubTests = 1;
+}
+
+OCLSemaphore::~OCLSemaphore() {}  // empty: no destructor-time work
+
+// Builds sema_test and creates its buffers when the device advertises
+// cl_amd_semaphore; otherwise reports CL_DEVICE_NOT_FOUND and returns early.
+void OCLSemaphore::open(unsigned int test, char* units, double& conversion,
+                        unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  char name[1024] = {0};
+  size_t size = 0;
+  _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024,
+                            name, &size);
+  hasSemaphore = strstr(name, "cl_amd_semaphore") != NULL;
+  if (!hasSemaphore) {
+    error_ = CL_DEVICE_NOT_FOUND;
+    printf("Semaphore extension is required for this test!\n");
+    return;
+  }
+  _wrapper->clGetDeviceInfo(devices_[deviceId],
+                            (cl_device_info)CL_DEVICE_MAX_SEMAPHORES_AMD,
+                            sizeof(size), &size, NULL);
+  _wrapper->clGetDeviceInfo(devices_[deviceId],
+                            (cl_device_info)CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD,
+                            sizeof(size), &size, NULL);
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};  // stays printable if the log query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "sema_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  // query the kernel only after its creation is known to have succeeded
+  _wrapper->clGetKernelInfo(kernel_,
+                            (cl_kernel_info)CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD,
+                            sizeof(size), &size, NULL);
+
+  for (unsigned int i = 0; i < MaxSemaphores; ++i) {
+    cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                             sizeof(cl_uint), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+
+  cl_mem dataBuffer =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                               1024 * size * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(dataBuffer);
+  dataBuffer =
+      _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                               1024 * size * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(dataBuffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused no-op
+
+// Runs sema_test once and compares the values read back from the buffers with
+// the expected ones; does nothing when open() found no semaphore support.
+void OCLSemaphore::run(void) {
+  if (!hasSemaphore) {
+    return;
+  }
+  cl_uint initVal[2] = {5, 10};
+
+  for (unsigned int i = 0; i < MaxSemaphores; ++i) {
+    error_ = _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_uint), &initVal[i]);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  }
+
+  cl_mem buffer = buffers()[MaxSemaphores];
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, MaxSemaphores, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  buffer = buffers()[MaxSemaphores + 1];
+  error_ = _wrapper->clSetKernelArg(kernel_, MaxSemaphores + 1, sizeof(cl_mem),
+                                    &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  cl_int val = 64;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, MaxSemaphores + 2, sizeof(val), &val);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // enqueue on the queue of the device the program was built for in open()
+  size_t gws[1] = {64};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  cl_uint outputV[MaxSemaphores] = {0};
+
+  // Find the new counter value
+  initVal[0]++;
+  initVal[1]--;
+
+  for (unsigned int i = 0; i < MaxSemaphores; ++i) {
+    error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers()[i],
+                                           true, 0, sizeof(cl_uint),
+                                           &outputV[i], 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+    if (initVal[i] != outputV[i]) {
+      printf("%u != %u", initVal[i], outputV[i]);
+      CHECK_RESULT(true, " - Incorrect result for counter!\n");
+    }
+  }
+
+  // Restore the original value to check the returned result in the kernel
+  initVal[0]--;
+  initVal[1]++;
+
+  error_ = _wrapper->clEnqueueReadBuffer(
+      cmdQueues_[_deviceId], buffers()[MaxSemaphores], true, 0,
+      MaxSemaphores * sizeof(cl_uint), outputV, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  for (unsigned int i = 0; i < MaxSemaphores; ++i) {
+    if (initVal[i] != outputV[i]) {
+      printf("%u != %u", initVal[i], outputV[i]);
+      CHECK_RESULT(true,
+                   " - Incorrect result for counter inside kernel. Returned "
+                   "value != original.\n");
+    }
+  }
+}
+
+unsigned int OCLSemaphore::close(void) { return OCLTestImp::close(); }  // forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h
new file mode 100644
index 0000000000..9d7aa54dd7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_SEMAPHORE_H_
+#define _OCL_SEMAPHORE_H_
+
+#include "OCLTestImp.h"
+
+class OCLSemaphore : public OCLTestImp {  // test for cl_amd_semaphore
+ public:
+  OCLSemaphore();
+  virtual ~OCLSemaphore();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // returns immediately while hasSemaphore is false
+  virtual unsigned int close(void);
+  bool hasSemaphore;  // set by open(): device advertises cl_amd_semaphore
+};
+
+#endif  // _OCL_SEMAPHORE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp
new file mode 100644
index 0000000000..98709241d8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLStablePState.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+cl_device_id gpu_device;  // device selected by open(), used by run()
+
+OCLStablePState::OCLStablePState() : failed_(false) {
+  // run() is skipped whenever failed_ is set; it starts out cleared
+  _numSubTests = 1;
+}
+
+OCLStablePState::~OCLStablePState() {}  // empty: no destructor-time work
+
+// Looks up the device with index deviceId on platform _platformIndex and
+// stores it in gpu_device for run(). Runs whose device type is not GPU get
+// CL_DEVICE_NOT_FOUND and return before any platform query is made.
+void OCLStablePState::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id* devices = NULL;
+  cl_device_id device = NULL;
+  _deviceId = deviceId;
+
+  if (type_ != CL_DEVICE_TYPE_GPU) {
+    error_ = CL_DEVICE_NOT_FOUND;
+    printf("GPU device is required for this test!\n");
+    return;
+  }
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // copy the handle out and free the array before any early return
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    delete[] platforms;  // allocated with new[]
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  // the vendor string is fetched but not inspected
+  char pbuf[100];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                       sizeof(pbuf), pbuf, NULL);
+  /* Get the number of requested devices */
+  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+
+  devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  free(devices);  // only the selected handle is needed from here on
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+  gpu_device = device;
+}
+
+static void CL_CALLBACK notify_callback(cl_event event,
+                                        cl_int event_command_exec_status,
+                                        void* user_data) {}  // unused no-op
+
+// Switches gpu_device to the profiling clock mode and back to the default
+// one; the result of either call is only checked when ATI_OS_WIN is defined.
+void OCLStablePState::run(void) {
+  if (failed_) return;
+  cl_set_device_clock_mode_input_amd modeRequest;
+  cl_set_device_clock_mode_output_amd modeReply = {};
+  modeRequest.clock_mode = CL_DEVICE_CLOCK_MODE_PROFILING_AMD;
+  error_ =
+      _wrapper->clSetDeviceClockModeAMD(gpu_device, modeRequest, &modeReply);
+#ifdef ATI_OS_WIN
+  CHECK_RESULT(error_ != CL_SUCCESS, "SetClockMode profiling failed\n");
+#else
+  error_ = CL_SUCCESS;
+#endif
+
+  modeReply = {};
+  modeRequest.clock_mode = CL_DEVICE_CLOCK_MODE_DEFAULT_AMD;
+  error_ =
+      _wrapper->clSetDeviceClockModeAMD(gpu_device, modeRequest, &modeReply);
+#ifdef ATI_OS_WIN
+  CHECK_RESULT(error_ != CL_SUCCESS, "SetClockMode default failed\n");
+#else
+  error_ = CL_SUCCESS;
+#endif
+}
+
+unsigned int OCLStablePState::close(void) { return OCLTestImp::close(); }  // forwards to the base class
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h
new file mode 100644
index 0000000000..ec2e6750ff
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_STABLE_PSTATE_H_
+#define _OCL_STABLE_PSTATE_H_
+
+#include "OCLTestImp.h"
+
+class OCLStablePState : public OCLTestImp {  // clSetDeviceClockModeAMD test
+ public:
+  OCLStablePState();
+  virtual ~OCLStablePState();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);  // returns immediately when failed_ is set
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // cleared by the constructor; never set in OCLStablePState.cpp
+};
+
+#endif  // _OCL_STABLE_PSTATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp
new file mode 100644
index 0000000000..785e27c874
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp
@@ -0,0 +1,344 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLThreadTrace.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+const static unsigned int IOThreadTrace = 3;  // number of input/output buffers
+static size_t SeNum = 1;                      // number of shader engines (SEs)
+const static unsigned int ttBufSize = 30000;  // size of thread trace buffer
+const static unsigned int InputElements = 2048;  // elements in each vector
+// OpenCL C source of the traced kernel: element-wise C[idx] = A[idx] + B[idx].
+const static char* strKernel =
+    "__kernel void thread_trace_test(                                       \n"
+    "   __global int *A,__global int *B,__global int *C)                    \n"
+    "{                                                                      \n"
+    "   int idx = get_global_id(0);                                         \n"
+    "   C[idx] = A[idx] + B[idx];                                           \n"
+    "}                                                                      \n";
+
+OCLThreadTrace::OCLThreadTrace() {
+  _numSubTests = 1;  // this test consists of a single sub-test
+  failed_ = false;
+  threadTrace_ = 0;  // close() tests this handle, so it must start out null
+  ioBuf_ = ttBuf_ = 0;
+  clCreateThreadTraceAMD_ = 0;
+  clReleaseThreadTraceAMD_ = 0;
+  clRetainThreadTraceAMD_ = 0;
+  clGetThreadTraceInfoAMD_ = 0;
+  clSetThreadTraceParamAMD_ = 0;
+  clEnqueueThreadTraceCommandAMD_ = 0;
+  clEnqueueBindThreadTraceBufferAMD_ = 0;
+}
+// Empty: resources are released in close(), not in the destructor.
+OCLThreadTrace::~OCLThreadTrace() {}
+
+// Prepares the test: requires a GPU with thread-trace support, resolves the
+// AMD thread-trace entry points, creates the trace object, builds the kernel
+// and creates all buffers. Sets failed_ when the device cannot run the test.
+void OCLThreadTrace::open(unsigned int test, char* units, double& conversion,
+                          unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening");
+
+  if (deviceId >= deviceCount_) {
+    failed_ = true;
+    return;
+  }
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+
+  // Zeroed first, in case the runtime writes fewer than sizeof(size_t) bytes.
+  size_t threadTraceEnabled = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD,
+      sizeof(threadTraceEnabled), &threadTraceEnabled, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  if (!threadTraceEnabled) {
+    failed_ = true;
+    testDescString = "Not supported";
+    return;
+  }
+
+  unsigned int datasize = sizeof(unsigned int) * InputElements;
+
+  ioBuf_ = (unsigned int**)calloc(IOThreadTrace, sizeof(unsigned int*));
+  CHECK_RESULT((ioBuf_ == NULL), "malloc  failed");
+  for (unsigned i = 0; i < IOThreadTrace; ++i) {
+    ioBuf_[i] = (unsigned int*)malloc(datasize);
+    CHECK_RESULT((ioBuf_[i] == NULL), "malloc  failed");
+    for (unsigned j = 0; j < InputElements; ++j) {
+      ioBuf_[i][j] = j;
+    }
+  }
+
+  // Resolve all entry points up front so close() can always release the trace.
+  clCreateThreadTraceAMD_ =
+      (fnp_clCreateThreadTraceAMD)_wrapper->clGetExtensionFunctionAddress(
+          "clCreateThreadTraceAMD");
+  CHECK_RESULT((clCreateThreadTraceAMD_ == 0),
+               "clGetExtensionFunctionAddress(clCreateThreadTraceAMD) failed");
+  clGetThreadTraceInfoAMD_ =
+      (fnp_clGetThreadTraceInfoAMD)_wrapper->clGetExtensionFunctionAddress(
+          "clGetThreadTraceInfoAMD");
+  CHECK_RESULT((clGetThreadTraceInfoAMD_ == 0),
+               "clGetExtensionFunctionAddress(clGetThreadTraceInfoAMD) failed");
+  clReleaseThreadTraceAMD_ =
+      (fnp_clReleaseThreadTraceAMD)_wrapper->clGetExtensionFunctionAddress(
+          "clReleaseThreadTraceAMD");
+  CHECK_RESULT((clReleaseThreadTraceAMD_ == 0),
+               "clGetExtensionFunctionAddress(clReleaseThreadTraceAMD) failed");
+  clRetainThreadTraceAMD_ =
+      (fnp_clRetainThreadTraceAMD)_wrapper->clGetExtensionFunctionAddress(
+          "clRetainThreadTraceAMD");
+  CHECK_RESULT((clRetainThreadTraceAMD_ == 0),
+               "clGetExtensionFunctionAddress(clRetainThreadTraceAMD) failed");
+  clSetThreadTraceParamAMD_ =
+      (fnp_clSetThreadTraceParamAMD)_wrapper->clGetExtensionFunctionAddress(
+          "clSetThreadTraceParamAMD");
+  CHECK_RESULT(
+      (clSetThreadTraceParamAMD_ == 0),
+      "clGetExtensionFunctionAddress(clSetThreadTraceParamAMD) failed");
+  clEnqueueThreadTraceCommandAMD_ = (fnp_clEnqueueThreadTraceCommandAMD)
+                                        _wrapper->clGetExtensionFunctionAddress(
+                                            "clEnqueueThreadTraceCommandAMD");
+  CHECK_RESULT(
+      (clEnqueueThreadTraceCommandAMD_ == 0),
+      "clGetExtensionFunctionAddress(clEnqueueThreadTraceCommandAMD) failed");
+  clEnqueueBindThreadTraceBufferAMD_ =
+      (fnp_clEnqueueBindThreadTraceBufferAMD)_wrapper
+          ->clGetExtensionFunctionAddress("clEnqueueBindThreadTraceBufferAMD");
+  CHECK_RESULT((clEnqueueBindThreadTraceBufferAMD_ == 0),
+               "clGetExtensionFunctionAddress("
+               "clEnqueueBindThreadTraceBufferAMD) failed");
+
+  threadTrace_ = clCreateThreadTraceAMD_(devices_[deviceId], &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateThreadTraceAMD() failed");
+
+  // Get number of shader engines
+  error_ = clGetThreadTraceInfoAMD_(threadTrace_, CL_THREAD_TRACE_SE,
+                                    sizeof(SeNum), &SeNum, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetThreadTraceInfoAMD() failed");
+
+  ttBuf_ = (unsigned int**)calloc(SeNum, sizeof(unsigned int*));
+  CHECK_RESULT((ttBuf_ == NULL), "malloc  failed");
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource()  failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "thread_trace_test", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+  for (unsigned int i = 0; i < IOThreadTrace; ++i) {
+    buffer = _wrapper->clCreateBuffer(context_,
+                                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                      datasize, ioBuf_[i], &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+
+  for (unsigned int i = 0; i < SeNum; ++i) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, ttBufSize,
+                                      NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused no-op
+
+// Debug helper, only called when DUMPTRACE is non-zero: writes numOfBytes / 2
+// 16-bit trace words as hex, one per line, to "TTrace<index><seq>.out".
+static void DumpTraceSI(unsigned int index, cl_ushort* tracePtr,
+                        size_t numOfBytes) {
+  static unsigned int iii = 0;  // running dump counter used in the file name
+  char file_name[32] = {0};
+  snprintf(file_name, sizeof(file_name), "TTrace%u%u.out", index, iii++);
+  FILE* outFile = fopen(file_name, "w");
+  if (outFile == NULL) {
+    return;  // dumping is best effort; never crash the test over it
+  }
+  for (size_t i = 0; i < numOfBytes / 2; i++) {
+    fprintf(outFile, "%04x\n", (unsigned int)tracePtr[i]);
+  }
+  fclose(outFile);
+}
+
+#define DUMPTRACE 0
+
+// Runs the vector-add kernel between thread-trace BEGIN and END commands,
+// reads back what every shader engine recorded and fails when a non-empty
+// trace holds nothing but zeroes. Returns at once if failed_ is set.
+void OCLThreadTrace::run(void) {
+  cl_mem* ttArrBuf = 0;
+  unsigned int* ttBufRecordedSizes = 0;
+  unsigned int i = 0;
+
+  if (failed_) {
+    return;
+  }
+
+  for (i = 0; i < IOThreadTrace; ++i) {
+    cl_mem buffer = buffers()[i];
+    error_ = _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffer);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+  }
+
+  size_t globalWorkSize[1];
+  size_t localWorkSize[1];
+  globalWorkSize[0] = InputElements;
+  localWorkSize[0] = 32;
+
+  // One trace buffer per shader engine; they follow the I/O buffers.
+  ttArrBuf = (cl_mem*)malloc(sizeof(cl_mem) * SeNum);
+  CHECK_RESULT((ttArrBuf == NULL), "malloc  failed");
+  for (i = 0; i < SeNum; i++) ttArrBuf[i] = buffers()[IOThreadTrace + i];
+
+  cl_event clEvent;
+  error_ = clEnqueueBindThreadTraceBufferAMD_(
+      cmdQueues_[_deviceId], threadTrace_, ttArrBuf, (cl_uint)SeNum, ttBufSize,
+      0, NULL, &clEvent);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clEnqueueBindThreadTraceBufferAMD() failed");
+
+  // Trace window: BEGIN, run the kernel to completion, END.
+  error_ = clEnqueueThreadTraceCommandAMD_(cmdQueues_[_deviceId], threadTrace_,
+                                           CL_THREAD_TRACE_BEGIN_COMMAND, 0,
+                                           NULL, &clEvent);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clEnqueueThreadTraceCommandAMD() failed");
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, globalWorkSize, localWorkSize,
+                                            0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  clFinish(cmdQueues_[_deviceId]);
+
+  error_ = clEnqueueThreadTraceCommandAMD_(cmdQueues_[_deviceId], threadTrace_,
+                                           CL_THREAD_TRACE_END_COMMAND, 0, NULL,
+                                           &clEvent);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clEnqueueThreadTraceCommandAMD() failed");
+
+  // Ask for the byte size of the per-SE size array first ...
+  size_t ttBufRecordedSize = 0;
+  error_ = clGetThreadTraceInfoAMD_(threadTrace_, CL_THREAD_TRACE_BUFFERS_SIZE,
+                                    1, NULL, &ttBufRecordedSize);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetThreadTraceInfoAMD() failed");
+
+  // ... then fetch it into a zeroed array of at least SeNum entries.
+  size_t sizesBytes = sizeof(unsigned int) * SeNum;
+  if (ttBufRecordedSize > sizesBytes) sizesBytes = ttBufRecordedSize;
+  ttBufRecordedSizes = (unsigned int*)calloc(1, sizesBytes);
+  CHECK_RESULT((ttBufRecordedSizes == NULL), "malloc  failed");
+  error_ =
+      clGetThreadTraceInfoAMD_(threadTrace_, CL_THREAD_TRACE_BUFFERS_SIZE,
+                               ttBufRecordedSize, ttBufRecordedSizes, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetThreadTraceInfoAMD() failed");
+
+  bool validRes = true;
+  for (i = 0; i < SeNum; ++i) {
+    // Recorded sizes are byte counts (they are used as the read size below).
+    const size_t bytes = ttBufRecordedSizes[i];
+    if (bytes == 0) continue;  // nothing recorded on this shader engine
+
+    // Whole zero-filled words, so the scan never touches unread memory.
+    const size_t words = (bytes + sizeof(cl_uint) - 1) / sizeof(cl_uint);
+    free(ttBuf_[i]);  // buffer left over from an earlier run(), if any
+    ttBuf_[i] = (cl_uint*)calloc(words, sizeof(cl_uint));
+    CHECK_RESULT((ttBuf_[i] == NULL), "malloc  failed");
+
+    error_ = _wrapper->clEnqueueReadBuffer(
+        cmdQueues_[_deviceId], buffers()[IOThreadTrace + i], CL_TRUE, 0, bytes,
+        ttBuf_[i], 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+#if DUMPTRACE
+    DumpTraceSI(i, (cl_ushort*)ttBuf_[i], bytes);
+#endif
+
+    size_t j = 0;
+    while (j < words && ttBuf_[i][j] == 0) ++j;
+    if (j == words) validRes = false;  // the trace is all zeroes
+  }
+
+  // Free the scratch arrays before the final check, which may return early.
+  // NOTE(review): the earlier CHECK_RESULT exits still appear to leak them.
+  free(ttArrBuf);
+  free(ttBufRecordedSizes);
+
+  if (!validRes) {
+    CHECK_RESULT(
+        true,
+        " - Incorrect result for thread trace. no output data was recorded.\n");
+  }
+}
+
+// Releases the thread trace object and the host arrays, then defers to the
+// base class. Every handle is nulled so that a repeated close() is harmless.
+unsigned int OCLThreadTrace::close(void) {
+  if (clReleaseThreadTraceAMD_ && threadTrace_) {
+    clReleaseThreadTraceAMD_(threadTrace_);
+    threadTrace_ = 0;
+  }
+
+  if (ioBuf_) {
+    for (unsigned i = 0; i < IOThreadTrace; ++i) free(ioBuf_[i]);
+    free(ioBuf_);
+    ioBuf_ = 0;
+  }
+
+  if (ttBuf_) {
+    for (unsigned i = 0; i < SeNum; ++i) free(ttBuf_[i]);
+    free(ttBuf_);
+    ttBuf_ = 0;
+  }
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h
new file mode 100644
index 0000000000..6995b499b6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_THREAD_TRACE_H_
+#define _OCL_THREAD_TRACE_H_
+
+#include "OCLTestImp.h"
+#include "cl_thread_trace_amd.h"
+
+// Function-pointer types for the AMD thread-trace extension entry points.
+typedef CL_API_ENTRY cl_threadtrace_amd(
+    CL_API_CALL *fnp_clCreateThreadTraceAMD)(cl_device_id, cl_int *);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clReleaseThreadTraceAMD)(
+    cl_threadtrace_amd);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clRetainThreadTraceAMD)(
+    cl_threadtrace_amd);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clGetThreadTraceInfoAMD)(
+    cl_threadtrace_amd, cl_threadtrace_info, size_t, void *, size_t *);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clSetThreadTraceParamAMD)(
+    cl_threadtrace_amd, cl_thread_trace_param, cl_uint);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clEnqueueThreadTraceCommandAMD)(
+    cl_command_queue, cl_threadtrace_amd, cl_threadtrace_command_name_amd,
+    cl_uint, const cl_event *, cl_event *);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clEnqueueBindThreadTraceBufferAMD)(
+    cl_command_queue, cl_threadtrace_amd, cl_mem *, cl_uint, cl_uint, cl_uint,
+    const cl_event *, cl_event *);
+
+class OCLThreadTrace : public OCLTestImp {
+ public:
+  OCLThreadTrace();
+  virtual ~OCLThreadTrace();
+
+ public:  // test lifecycle: open() sets up, run() traces a kernel, close() frees
+  virtual void open(unsigned int test, char *units, double &conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;      // set by open() when the test cannot run; run() then returns
+  cl_uint **ioBuf_;  // host arrays used to initialise the kernel's buffers
+  cl_uint **ttBuf_;  // host copies of the trace data, one per shader engine
+  cl_threadtrace_amd threadTrace_;  // trace object created in open()
+  // Extension entry points, resolved with clGetExtensionFunctionAddress().
+  fnp_clCreateThreadTraceAMD clCreateThreadTraceAMD_;
+  fnp_clReleaseThreadTraceAMD clReleaseThreadTraceAMD_;
+  fnp_clRetainThreadTraceAMD clRetainThreadTraceAMD_;
+  fnp_clGetThreadTraceInfoAMD clGetThreadTraceInfoAMD_;
+  fnp_clSetThreadTraceParamAMD clSetThreadTraceParamAMD_;
+  fnp_clEnqueueThreadTraceCommandAMD clEnqueueThreadTraceCommandAMD_;
+  fnp_clEnqueueBindThreadTraceBufferAMD clEnqueueBindThreadTraceBufferAMD_;
+};
+
+#endif  // _OCL_THREAD_TRACE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp
new file mode 100644
index 0000000000..fc7298e087
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp
@@ -0,0 +1,127 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLUnalignedCopy.h"
+
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+static const int BufSize = 64;  // bytes written, filled and read back per case
+
+OCLUnalignedCopy::OCLUnalignedCopy() {
+  _numSubTests = 1;
+  failed_ = false;  // open() sets this when the device is not a GPU
+}
+
+OCLUnalignedCopy::~OCLUnalignedCopy() {}
+
+// Opens the test on the given device and creates the source (read-only) and
+// destination (write-only) buffers; sets failed_ when the device is not a GPU.
+void OCLUnalignedCopy::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  const cl_mem_flags flags[2] = {CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
+  for (int i = 0; i < 2; ++i) {
+    cl_mem buffer = _wrapper->clCreateBuffer(
+        context_, flags[i], BufSize * sizeof(cl_int4), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}  // unused no-op
+
+// Runs each {source offset, destination offset, byte count} case as a device
+// buffer copy and checks that only the requested destination bytes changed.
+void OCLUnalignedCopy::run(void) {
+  if (failed_) {
+    return;
+  }
+
+  // Source bytes 0..BufSize-1. Static, so the non-blocking write below never
+  // outlives its host buffer even if a check leaves the function early.
+  static const struct Source {
+    char bytes[BufSize];
+    Source() { for (int i = 0; i < BufSize; ++i) bytes[i] = i; }
+  } values;
+  char results[BufSize];
+
+  static const char TestCnt = 7;
+  char sizes[TestCnt][3] = {
+      {5, 7, 13},   {5, 7, 12},   {4, 9, 12},   {4, 9, 15},
+      {27, 16, 15}, {27, 16, 13}, {32, 16, 13},
+  };
+
+  for (int i = 0; i < TestCnt; ++i) {
+    error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers_[0],
+                                            CL_FALSE, 0, BufSize, values.bytes,
+                                            0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
+
+    cl_uint pattern = 0;
+    error_ = /*_wrapper->*/ clEnqueueFillBuffer(
+        cmdQueues_[_deviceId], buffers_[1], &pattern, sizeof(pattern), 0,
+        BufSize, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillBuffer() failed");
+
+    error_ = _wrapper->clEnqueueCopyBuffer(
+        cmdQueues_[_deviceId], buffers_[0], buffers_[1], sizes[i][0],
+        sizes[i][1], sizes[i][2], 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueCopyBuffer failed");
+
+    error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[1],
+                                           CL_TRUE, 0, BufSize, results, 0,
+                                           NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+
+    for (int j = 0; j < sizes[i][1]; ++j) {
+      CHECK_RESULT(results[j] != 0, "Comparison failed");
+    }
+    for (int j = sizes[i][1], k = 0; j < (sizes[i][1] + sizes[i][2]);
+         ++j, ++k) {
+      CHECK_RESULT(results[j] != sizes[i][0] + k, "Comparison failed");
+    }
+    for (int j = (sizes[i][1] + sizes[i][2]); j < BufSize; ++j) {
+      CHECK_RESULT(results[j] != 0, "Comparison failed");
+    }
+  }
+}
+
+unsigned int OCLUnalignedCopy::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h
new file mode 100644
index 0000000000..18c764af86
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UNALIGNED_COPY_H_
+#define _OCL_UNALIGNED_COPY_H_
+
+#include "OCLTestImp.h"
+
+class OCLUnalignedCopy : public OCLTestImp {
+ public:
+  OCLUnalignedCopy();
+  virtual ~OCLUnalignedCopy();
+
+ public:  // test lifecycle: open() creates buffers, run() copies and verifies
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;  // set by open() for non-GPU devices; run() then returns
+};
+
+#endif  // _OCL_UNALIGNED_COPY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp
new file mode 100644
index 0000000000..4d03b22ee8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLTestListImp.h"
+
+//
+// Includes for tests
+//
+#include "OCLAsyncMap.h"
+#include "OCLAsyncTransfer.h"
+#include "OCLAtomicCounter.h"
+#include "OCLBlitKernel.h"
+#include "OCLBufferFromImage.h"
+#include "OCLCPUGuardPages.h"
+#include "OCLCreateBuffer.h"
+#include "OCLCreateContext.h"
+#include "OCLCreateImage.h"
+#include "OCLDeviceAtomic.h"
+#include "OCLDeviceQueries.h"
+#include "OCLDynamic.h"
+#include "OCLDynamicBLines.h"
+#include "OCLGenericAddressSpace.h"
+#include "OCLGetQueueThreadID.h"
+#include "OCLGlobalOffset.h"
+#include "OCLImage2DFromBuffer.h"
+#include "OCLImageCopyPartial.h"
+#include "OCLKernelBinary.h"
+#include "OCLLDS32K.h"
+#include "OCLLinearFilter.h"
+#include "OCLLiquidFlash.h"
+#include "OCLMapCount.h"
+#include "OCLMemDependency.h"
+#include "OCLMemObjs.h"
+#include "OCLMemoryInfo.h"
+#include "OCLMultiQueue.h"
+#include "OCLOfflineCompilation.h"
+#include "OCLP2PBuffer.h"
+#include "OCLPartialWrkgrp.h"
+#include "OCLPerfCounters.h"
+#include "OCLPersistent.h"
+#include "OCLPinnedMemory.h"
+#include "OCLPlatformAtomics.h"
+#include "OCLProgramScopeVariables.h"
+#include "OCLRTQueue.h"
+#include "OCLReadWriteImage.h"
+#include "OCLSDI.h"
+#include "OCLSVM.h"
+#include "OCLSemaphore.h"
+#include "OCLStablePState.h"
+#include "OCLThreadTrace.h"
+#include "OCLUnalignedCopy.h"
+
+//
+//  Helpers for adding tests: a per-type factory function and the TEST() macro
+//
+template <typename T>
+static void* dictionary_CreateTestFunc(void) {
+  return new T();  // default-constructs a test of type T on the heap
+}
+// TEST(name) expands to a TestEntry initializer: {"name", factory for name}.
+#define TEST(name) \
+  { #name, &dictionary_CreateTestFunc < name> }
+
+TestEntry TestList[] = {  // one {name, factory} entry per test in this module
+    TEST(OCLCreateContext),
+    TEST(OCLAtomicCounter),
+    TEST(OCLKernelBinary),
+    TEST(OCLGlobalOffset),
+    TEST(OCLLinearFilter),
+    TEST(OCLAsyncTransfer),
+    TEST(OCLLDS32K),
+    TEST(OCLMemObjs),
+    TEST(OCLSemaphore),
+    TEST(OCLPartialWrkgrp),
+    TEST(OCLCreateBuffer),
+    TEST(OCLCreateImage),
+    TEST(OCLCPUGuardPages),
+    TEST(OCLMapCount),
+    TEST(OCLMemoryInfo),
+    TEST(OCLOfflineCompilation),
+    TEST(OCLMemDependency),
+    TEST(OCLGetQueueThreadID),
+    TEST(OCLDeviceQueries),
+    TEST(OCLSDI),
+    TEST(OCLThreadTrace),
+    TEST(OCLMultiQueue),
+    TEST(OCLImage2DFromBuffer),
+    TEST(OCLBufferFromImage),
+    TEST(OCLPerfCounters),
+    TEST(OCLSVM),
+    TEST(OCLProgramScopeVariables),
+    TEST(OCLGenericAddressSpace),
+    TEST(OCLDynamic),
+    TEST(OCLPlatformAtomics),
+    TEST(OCLDeviceAtomic),
+    TEST(OCLDynamicBLines),
+    TEST(OCLUnalignedCopy),
+    TEST(OCLBlitKernel),
+    TEST(OCLLiquidFlash),
+    TEST(OCLRTQueue),
+    TEST(OCLAsyncMap),
+    TEST(OCLPinnedMemory),
+    TEST(OCLReadWriteImage),
+    TEST(OCLStablePState),
+    TEST(OCLP2PBuffer),
+    // Failures in Linux: IOL doesn't support tiling aperture and Cypress linear
+    // image writes. Disabled entry: TEST(OCLPersistent),
+};
+// Module metadata: number of TestList entries, library version and name.
+unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);
+unsigned int TestLibVersion = 0;
+const char* TestLibName = "oclruntime";
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude b/projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude
new file mode 100644
index 0000000000..a5807cb63c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude
@@ -0,0 +1,7 @@
+# all clear
+OCLImageCopyPartial
+
+# EPR 362715
+OCLCPUGuardPages
+
+OCLRegionDeviceQueries