From 18ce996fe2a5ff9c2cdec3da3a3001fac190a875 Mon Sep 17 00:00:00 2001 From: Vlad Sytchenko Date: Fri, 29 May 2020 12:10:04 -0400 Subject: [PATCH] Initial source drop of ocltst This only adds source files for ocltst and the following test modules - oclruntime, oclperf, oclgl, ocldx. There's no build files for now. Change-Id: I0f8d9d074c45d82e92f7d30bf22753102f272f4f [ROCm/clr commit: 75e6add24d0a46e7fe5325e23ad9d5721365f036] --- projects/clr/opencl/tests/ocltst/env/Module.h | 54 + .../opencl/tests/ocltst/env/ResultStruct.h | 71 + .../clr/opencl/tests/ocltst/env/Timer.cpp | 111 ++ projects/clr/opencl/tests/ocltst/env/Timer.h | 46 + projects/clr/opencl/tests/ocltst/env/Worker.h | 180 ++ .../opencl/tests/ocltst/env/oclsysinfo.cpp | 162 ++ .../clr/opencl/tests/ocltst/env/oclsysinfo.h | 28 + .../clr/opencl/tests/ocltst/env/ocltst.cpp | 1611 +++++++++++++++++ projects/clr/opencl/tests/ocltst/env/pfm.cpp | 79 + projects/clr/opencl/tests/ocltst/env/pfm.h | 28 + .../opencl/tests/ocltst/include/OCL/Thread.h | 148 ++ .../clr/opencl/tests/ocltst/include/OCLLog.h | 47 + .../clr/opencl/tests/ocltst/include/OCLTest.h | 73 + .../opencl/tests/ocltst/include/OCLTestList.h | 43 + .../tests/ocltst/include/OCLTestUtils.h | 31 + .../opencl/tests/ocltst/include/OCLWrapper.h | 614 +++++++ .../opencl/tests/ocltst/log/oclTestLog.cpp | 104 ++ .../clr/opencl/tests/ocltst/log/oclTestLog.h | 44 + .../ocltst/module/common/BaseTestImp.cpp | 185 ++ .../ocltst/module/common/OCLGLCommon.cpp | 175 ++ .../tests/ocltst/module/common/OCLGLCommon.h | 80 + .../ocltst/module/common/OCLGLCommonLinux.cpp | 239 +++ .../module/common/OCLGLCommonWindows.cpp | 239 +++ .../tests/ocltst/module/common/OCLTestImp.cpp | 288 +++ .../ocltst/module/common/OCLTestListImp.cpp | 70 + .../ocltst/module/common/OCLTestUtils.cpp | 46 + .../tests/ocltst/module/common/OCLThread.cpp | 209 +++ .../tests/ocltst/module/common/OCLWrapper.cpp | 944 ++++++++++ .../tests/ocltst/module/common/Timer.cpp | 112 ++ 
.../opencl/tests/ocltst/module/common/Timer.h | 46 + .../tests/ocltst/module/dx/OCLDX11Common.cpp | 236 +++ .../tests/ocltst/module/dx/OCLDX11Common.h | 68 + .../tests/ocltst/module/dx/OCLDX11YUY2.cpp | 478 +++++ .../tests/ocltst/module/dx/OCLDX11YUY2.h | 56 + .../tests/ocltst/module/dx/TestList.cpp | 52 + .../tests/ocltst/module/dx/ocldx.exclude | 1 + .../tests/ocltst/module/gl/OCLGLBuffer.cpp | 220 +++ .../tests/ocltst/module/gl/OCLGLBuffer.h | 42 + .../module/gl/OCLGLBufferMultipleQueues.cpp | 303 ++++ .../module/gl/OCLGLBufferMultipleQueues.h | 48 + .../ocltst/module/gl/OCLGLDepthBuffer.cpp | 270 +++ .../tests/ocltst/module/gl/OCLGLDepthBuffer.h | 66 + .../tests/ocltst/module/gl/OCLGLDepthTex.cpp | 278 +++ .../tests/ocltst/module/gl/OCLGLDepthTex.h | 62 + .../tests/ocltst/module/gl/OCLGLFenceSync.cpp | 481 +++++ .../tests/ocltst/module/gl/OCLGLFenceSync.h | 55 + .../ocltst/module/gl/OCLGLMsaaTexture.cpp | 298 +++ .../tests/ocltst/module/gl/OCLGLMsaaTexture.h | 68 + .../ocltst/module/gl/OCLGLMultiContext.cpp | 231 +++ .../ocltst/module/gl/OCLGLMultiContext.h | 54 + .../tests/ocltst/module/gl/OCLGLTexture.cpp | 144 ++ .../tests/ocltst/module/gl/OCLGLTexture.h | 214 +++ .../tests/ocltst/module/gl/TestList.cpp | 54 + .../tests/ocltst/module/gl/oclgl.exclude | 1 + .../tests/ocltst/module/include/BaseTestImp.h | 206 +++ .../tests/ocltst/module/include/OCLTestImp.h | 83 + .../ocltst/module/include/OCLTestListImp.h | 86 + .../tests/ocltst/module/include/OclIncludes.h | 32 + .../module/perf/OCLPerf3DImageWriteSpeed.cpp | 211 +++ .../module/perf/OCLPerf3DImageWriteSpeed.h | 49 + .../ocltst/module/perf/OCLPerfAES256.cpp | 451 +++++ .../tests/ocltst/module/perf/OCLPerfAES256.h | 58 + .../ocltst/module/perf/OCLPerfAtomicSpeed.cpp | 817 +++++++++ .../ocltst/module/perf/OCLPerfAtomicSpeed.h | 119 ++ .../module/perf/OCLPerfAtomicSpeed20.cpp | 509 ++++++ .../ocltst/module/perf/OCLPerfAtomicSpeed20.h | 102 ++ .../module/perf/OCLPerfAtomicSpeed20Kernels.h | 73 + 
.../module/perf/OCLPerfAtomicSpeedKernels.h | 402 ++++ .../module/perf/OCLPerfBufferCopyOverhead.cpp | 254 +++ .../module/perf/OCLPerfBufferCopyOverhead.h | 50 + .../module/perf/OCLPerfBufferCopySpeed.cpp | 439 +++++ .../module/perf/OCLPerfBufferCopySpeed.h | 65 + .../module/perf/OCLPerfBufferReadSpeed.cpp | 334 ++++ .../module/perf/OCLPerfBufferReadSpeed.h | 65 + .../module/perf/OCLPerfBufferWriteSpeed.cpp | 333 ++++ .../module/perf/OCLPerfBufferWriteSpeed.h | 65 + .../ocltst/module/perf/OCLPerfCPUMemSpeed.cpp | 304 ++++ .../ocltst/module/perf/OCLPerfCPUMemSpeed.h | 59 + .../module/perf/OCLPerfCommandQueue.cpp | 146 ++ .../ocltst/module/perf/OCLPerfCommandQueue.h | 42 + .../ocltst/module/perf/OCLPerfConcurrency.cpp | 563 ++++++ .../ocltst/module/perf/OCLPerfConcurrency.h | 63 + .../module/perf/OCLPerfDevMemReadSpeed.cpp | 243 +++ .../module/perf/OCLPerfDevMemReadSpeed.h | 47 + .../module/perf/OCLPerfDevMemWriteSpeed.cpp | 212 +++ .../module/perf/OCLPerfDevMemWriteSpeed.h | 46 + .../module/perf/OCLPerfDeviceConcurrency.cpp | 480 +++++ .../module/perf/OCLPerfDeviceConcurrency.h | 60 + .../module/perf/OCLPerfDeviceEnqueue.cpp | 227 +++ .../ocltst/module/perf/OCLPerfDeviceEnqueue.h | 47 + .../module/perf/OCLPerfDeviceEnqueue2.cpp | 260 +++ .../module/perf/OCLPerfDeviceEnqueue2.h | 54 + .../module/perf/OCLPerfDeviceEnqueueEvent.cpp | 267 +++ .../module/perf/OCLPerfDeviceEnqueueEvent.h | 54 + .../module/perf/OCLPerfDeviceEnqueueSier.cpp | 233 +++ .../module/perf/OCLPerfDeviceEnqueueSier.h | 49 + .../module/perf/OCLPerfDispatchSpeed.cpp | 391 ++++ .../ocltst/module/perf/OCLPerfDispatchSpeed.h | 58 + .../ocltst/module/perf/OCLPerfDoubleDMA.cpp | 442 +++++ .../ocltst/module/perf/OCLPerfDoubleDMA.h | 42 + .../module/perf/OCLPerfDoubleDMASeq.cpp | 291 +++ .../ocltst/module/perf/OCLPerfDoubleDMASeq.h | 43 + .../ocltst/module/perf/OCLPerfFillBuffer.cpp | 114 ++ .../ocltst/module/perf/OCLPerfFillBuffer.h | 48 + .../ocltst/module/perf/OCLPerfFillImage.cpp | 109 ++ 
.../ocltst/module/perf/OCLPerfFillImage.h | 45 + .../tests/ocltst/module/perf/OCLPerfFlush.cpp | 165 ++ .../tests/ocltst/module/perf/OCLPerfFlush.h | 42 + .../module/perf/OCLPerfGenericBandwidth.cpp | 309 ++++ .../module/perf/OCLPerfGenericBandwidth.h | 57 + .../module/perf/OCLPerfGenoilSiaMiner.cpp | 429 +++++ .../module/perf/OCLPerfGenoilSiaMiner.h | 78 + .../module/perf/OCLPerfImageCopyCorners.cpp | 367 ++++ .../module/perf/OCLPerfImageCopyCorners.h | 55 + .../module/perf/OCLPerfImageCopySpeed.cpp | 344 ++++ .../module/perf/OCLPerfImageCopySpeed.h | 56 + .../ocltst/module/perf/OCLPerfImageCreate.cpp | 194 ++ .../ocltst/module/perf/OCLPerfImageCreate.h | 51 + .../module/perf/OCLPerfImageMapUnmap.cpp | 333 ++++ .../ocltst/module/perf/OCLPerfImageMapUnmap.h | 57 + .../module/perf/OCLPerfImageReadSpeed.cpp | 295 +++ .../module/perf/OCLPerfImageReadSpeed.h | 61 + .../module/perf/OCLPerfImageReadWrite.cpp | 223 +++ .../module/perf/OCLPerfImageReadWrite.h | 51 + .../module/perf/OCLPerfImageReadsRGBA.cpp | 236 +++ .../module/perf/OCLPerfImageReadsRGBA.h | 52 + .../module/perf/OCLPerfImageSampleRate.cpp | 324 ++++ .../module/perf/OCLPerfImageSampleRate.h | 58 + .../module/perf/OCLPerfImageWriteSpeed.cpp | 317 ++++ .../module/perf/OCLPerfImageWriteSpeed.h | 62 + .../module/perf/OCLPerfKernelArguments.cpp | 239 +++ .../module/perf/OCLPerfKernelArguments.h | 43 + .../module/perf/OCLPerfKernelThroughput.cpp | 1008 +++++++++++ .../module/perf/OCLPerfKernelThroughput.h | 118 ++ .../ocltst/module/perf/OCLPerfLDSLatency.cpp | 432 +++++ .../ocltst/module/perf/OCLPerfLDSLatency.h | 59 + .../module/perf/OCLPerfLDSReadSpeed.cpp | 395 ++++ .../ocltst/module/perf/OCLPerfLDSReadSpeed.h | 59 + .../ocltst/module/perf/OCLPerfMandelbrot.cpp | 940 ++++++++++ .../ocltst/module/perf/OCLPerfMandelbrot.h | 75 + .../module/perf/OCLPerfMapBufferReadSpeed.cpp | 262 +++ .../module/perf/OCLPerfMapBufferReadSpeed.h | 56 + .../perf/OCLPerfMapBufferWriteSpeed.cpp | 291 +++ 
.../module/perf/OCLPerfMapBufferWriteSpeed.h | 58 + .../module/perf/OCLPerfMapImageReadSpeed.cpp | 213 +++ .../module/perf/OCLPerfMapImageReadSpeed.h | 49 + .../module/perf/OCLPerfMapImageWriteSpeed.cpp | 214 +++ .../module/perf/OCLPerfMapImageWriteSpeed.h | 49 + .../module/perf/OCLPerfMatrixTranspose.cpp | 326 ++++ .../module/perf/OCLPerfMatrixTranspose.h | 57 + .../ocltst/module/perf/OCLPerfMemCombine.cpp | 234 +++ .../ocltst/module/perf/OCLPerfMemCombine.h | 56 + .../ocltst/module/perf/OCLPerfMemCreate.cpp | 176 ++ .../ocltst/module/perf/OCLPerfMemCreate.h | 43 + .../ocltst/module/perf/OCLPerfMemLatency.cpp | 418 +++++ .../ocltst/module/perf/OCLPerfMemLatency.h | 61 + .../perf/OCLPerfPinnedBufferReadSpeed.cpp | 347 ++++ .../perf/OCLPerfPinnedBufferReadSpeed.h | 66 + .../perf/OCLPerfPinnedBufferWriteSpeed.cpp | 342 ++++ .../perf/OCLPerfPinnedBufferWriteSpeed.h | 66 + .../module/perf/OCLPerfPipeCopySpeed.cpp | 504 ++++++ .../ocltst/module/perf/OCLPerfPipeCopySpeed.h | 60 + .../module/perf/OCLPerfProgramGlobalRead.cpp | 549 ++++++ .../module/perf/OCLPerfProgramGlobalRead.h | 60 + .../module/perf/OCLPerfProgramGlobalWrite.cpp | 384 ++++ .../module/perf/OCLPerfProgramGlobalWrite.h | 58 + .../ocltst/module/perf/OCLPerfSHA256.cpp | 841 +++++++++ .../tests/ocltst/module/perf/OCLPerfSHA256.h | 58 + .../ocltst/module/perf/OCLPerfSVMAlloc.cpp | 263 +++ .../ocltst/module/perf/OCLPerfSVMAlloc.h | 46 + .../module/perf/OCLPerfSVMKernelArguments.cpp | 255 +++ .../module/perf/OCLPerfSVMKernelArguments.h | 47 + .../ocltst/module/perf/OCLPerfSVMMap.cpp | 153 ++ .../tests/ocltst/module/perf/OCLPerfSVMMap.h | 44 + .../ocltst/module/perf/OCLPerfSVMMemFill.cpp | 214 +++ .../ocltst/module/perf/OCLPerfSVMMemFill.h | 50 + .../ocltst/module/perf/OCLPerfSVMMemcpy.cpp | 216 +++ .../ocltst/module/perf/OCLPerfSVMMemcpy.h | 47 + .../module/perf/OCLPerfSVMSampleRate.cpp | 359 ++++ .../ocltst/module/perf/OCLPerfSVMSampleRate.h | 63 + .../ocltst/module/perf/OCLPerfSampleRate.cpp | 336 ++++ 
.../ocltst/module/perf/OCLPerfSampleRate.h | 60 + .../perf/OCLPerfScalarReplArrayElem.cpp | 325 ++++ .../module/perf/OCLPerfScalarReplArrayElem.h | 60 + .../ocltst/module/perf/OCLPerfSdiP2PCopy.cpp | 261 +++ .../ocltst/module/perf/OCLPerfSdiP2PCopy.h | 52 + .../tests/ocltst/module/perf/OCLPerfSepia.cpp | 586 ++++++ .../tests/ocltst/module/perf/OCLPerfSepia.h | 58 + .../module/perf/OCLPerfTextureMemLatency.cpp | 409 +++++ .../module/perf/OCLPerfTextureMemLatency.h | 60 + .../module/perf/OCLPerfUAVReadSpeed.cpp | 630 +++++++ .../ocltst/module/perf/OCLPerfUAVReadSpeed.h | 63 + .../perf/OCLPerfUAVReadSpeedHostMem.cpp | 437 +++++ .../module/perf/OCLPerfUAVReadSpeedHostMem.h | 63 + .../perf/OCLPerfUAVWriteSpeedHostMem.cpp | 380 ++++ .../module/perf/OCLPerfUAVWriteSpeedHostMem.h | 58 + .../module/perf/OCLPerfUncoalescedRead.cpp | 270 +++ .../module/perf/OCLPerfUncoalescedRead.h | 44 + .../module/perf/OCLPerfVerticalFetch.cpp | 353 ++++ .../ocltst/module/perf/OCLPerfVerticalFetch.h | 49 + .../tests/ocltst/module/perf/TestList.cpp | 191 ++ .../tests/ocltst/module/perf/oclperf.exclude | 28 + .../ocltst/module/runtime/OCLAsyncMap.cpp | 98 + .../tests/ocltst/module/runtime/OCLAsyncMap.h | 38 + .../module/runtime/OCLAsyncTransfer.cpp | 139 ++ .../ocltst/module/runtime/OCLAsyncTransfer.h | 38 + .../module/runtime/OCLAtomicCounter.cpp | 168 ++ .../ocltst/module/runtime/OCLAtomicCounter.h | 41 + .../ocltst/module/runtime/OCLBlitKernel.cpp | 612 +++++++ .../ocltst/module/runtime/OCLBlitKernel.h | 41 + .../module/runtime/OCLBufferFromImage.cpp | 289 +++ .../module/runtime/OCLBufferFromImage.h | 57 + .../module/runtime/OCLCPUGuardPages.cpp | 178 ++ .../ocltst/module/runtime/OCLCPUGuardPages.h | 49 + .../ocltst/module/runtime/OCLCreateBuffer.cpp | 173 ++ .../ocltst/module/runtime/OCLCreateBuffer.h | 47 + .../module/runtime/OCLCreateContext.cpp | 98 + .../ocltst/module/runtime/OCLCreateContext.h | 38 + .../ocltst/module/runtime/OCLCreateImage.cpp | 493 +++++ 
.../ocltst/module/runtime/OCLCreateImage.h | 48 + .../ocltst/module/runtime/OCLDeviceAtomic.cpp | 210 +++ .../ocltst/module/runtime/OCLDeviceAtomic.h | 44 + .../module/runtime/OCLDeviceQueries.cpp | 288 +++ .../ocltst/module/runtime/OCLDeviceQueries.h | 41 + .../ocltst/module/runtime/OCLDynamic.cpp | 225 +++ .../tests/ocltst/module/runtime/OCLDynamic.h | 43 + .../module/runtime/OCLDynamicBLines.cpp | 357 ++++ .../ocltst/module/runtime/OCLDynamicBLines.h | 54 + .../module/runtime/OCLGenericAddressSpace.cpp | 815 +++++++++ .../module/runtime/OCLGenericAddressSpace.h | 50 + .../module/runtime/OCLGetQueueThreadID.cpp | 116 ++ .../module/runtime/OCLGetQueueThreadID.h | 41 + .../ocltst/module/runtime/OCLGlobalOffset.cpp | 126 ++ .../ocltst/module/runtime/OCLGlobalOffset.h | 38 + .../module/runtime/OCLImage2DFromBuffer.cpp | 389 ++++ .../module/runtime/OCLImage2DFromBuffer.h | 56 + .../module/runtime/OCLImageCopyPartial.cpp | 347 ++++ .../module/runtime/OCLImageCopyPartial.h | 57 + .../ocltst/module/runtime/OCLKernelBinary.cpp | 252 +++ .../ocltst/module/runtime/OCLKernelBinary.h | 38 + .../tests/ocltst/module/runtime/OCLLDS32K.cpp | 371 ++++ .../tests/ocltst/module/runtime/OCLLDS32K.h | 51 + .../ocltst/module/runtime/OCLLinearFilter.cpp | 187 ++ .../ocltst/module/runtime/OCLLinearFilter.h | 38 + .../ocltst/module/runtime/OCLLiquidFlash.cpp | 264 +++ .../ocltst/module/runtime/OCLLiquidFlash.h | 57 + .../ocltst/module/runtime/OCLMapCount.cpp | 98 + .../tests/ocltst/module/runtime/OCLMapCount.h | 60 + .../module/runtime/OCLMemDependency.cpp | 153 ++ .../ocltst/module/runtime/OCLMemDependency.h | 38 + .../ocltst/module/runtime/OCLMemObjs.cpp | 139 ++ .../tests/ocltst/module/runtime/OCLMemObjs.h | 45 + .../ocltst/module/runtime/OCLMemoryInfo.cpp | 200 ++ .../ocltst/module/runtime/OCLMemoryInfo.h | 42 + .../ocltst/module/runtime/OCLMultiQueue.cpp | 295 +++ .../ocltst/module/runtime/OCLMultiQueue.h | 43 + .../module/runtime/OCLOfflineCompilation.cpp | 206 +++ 
.../module/runtime/OCLOfflineCompilation.h | 38 + .../ocltst/module/runtime/OCLP2PBuffer.cpp | 286 +++ .../ocltst/module/runtime/OCLP2PBuffer.h | 56 + .../module/runtime/OCLPartialWrkgrp.cpp | 292 +++ .../ocltst/module/runtime/OCLPartialWrkgrp.h | 41 + .../ocltst/module/runtime/OCLPerfCounters.cpp | 798 ++++++++ .../ocltst/module/runtime/OCLPerfCounters.h | 50 + .../ocltst/module/runtime/OCLPersistent.cpp | 139 ++ .../ocltst/module/runtime/OCLPersistent.h | 50 + .../ocltst/module/runtime/OCLPinnedMemory.cpp | 218 +++ .../ocltst/module/runtime/OCLPinnedMemory.h | 51 + .../module/runtime/OCLPlatformAtomics.cpp | 182 ++ .../module/runtime/OCLPlatformAtomics.h | 41 + .../runtime/OCLProgramScopeVariables.cpp | 274 +++ .../module/runtime/OCLProgramScopeVariables.h | 46 + .../ocltst/module/runtime/OCLRTQueue.cpp | 415 +++++ .../tests/ocltst/module/runtime/OCLRTQueue.h | 48 + .../module/runtime/OCLReadWriteImage.cpp | 372 ++++ .../ocltst/module/runtime/OCLReadWriteImage.h | 50 + .../tests/ocltst/module/runtime/OCLSDI.cpp | 515 ++++++ .../tests/ocltst/module/runtime/OCLSDI.h | 65 + .../tests/ocltst/module/runtime/OCLSVM.cpp | 612 +++++++ .../tests/ocltst/module/runtime/OCLSVM.h | 64 + .../ocltst/module/runtime/OCLSemaphore.cpp | 225 +++ .../ocltst/module/runtime/OCLSemaphore.h | 39 + .../ocltst/module/runtime/OCLStablePState.cpp | 129 ++ .../ocltst/module/runtime/OCLStablePState.h | 41 + .../ocltst/module/runtime/OCLThreadTrace.cpp | 344 ++++ .../ocltst/module/runtime/OCLThreadTrace.h | 71 + .../module/runtime/OCLUnalignedCopy.cpp | 127 ++ .../ocltst/module/runtime/OCLUnalignedCopy.h | 41 + .../tests/ocltst/module/runtime/TestList.cpp | 129 ++ .../ocltst/module/runtime/oclruntime.exclude | 7 + 290 files changed, 54116 insertions(+) create mode 100644 projects/clr/opencl/tests/ocltst/env/Module.h create mode 100644 projects/clr/opencl/tests/ocltst/env/ResultStruct.h create mode 100644 projects/clr/opencl/tests/ocltst/env/Timer.cpp create mode 100644 
projects/clr/opencl/tests/ocltst/env/Timer.h create mode 100644 projects/clr/opencl/tests/ocltst/env/Worker.h create mode 100644 projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp create mode 100644 projects/clr/opencl/tests/ocltst/env/oclsysinfo.h create mode 100644 projects/clr/opencl/tests/ocltst/env/ocltst.cpp create mode 100644 projects/clr/opencl/tests/ocltst/env/pfm.cpp create mode 100644 projects/clr/opencl/tests/ocltst/env/pfm.h create mode 100644 projects/clr/opencl/tests/ocltst/include/OCL/Thread.h create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLLog.h create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLTest.h create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLTestList.h create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLTestUtils.h create mode 100644 projects/clr/opencl/tests/ocltst/include/OCLWrapper.h create mode 100644 projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp create mode 100644 projects/clr/opencl/tests/ocltst/log/oclTestLog.h create mode 100644 projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/Timer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/common/Timer.h 
create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude create mode 100644 projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h 
create mode 100644 projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h create mode 100644 projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h create mode 100644 projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp create mode 100644 
projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp create mode 100644 
projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h create mode 
100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h create mode 100644 
projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h create mode 100644 
projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.cpp create mode 100644 
projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h 
create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h create mode 100644 
projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp create mode 100644 
projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h create mode 100644 
projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp create mode 100644 projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude diff --git a/projects/clr/opencl/tests/ocltst/env/Module.h b/projects/clr/opencl/tests/ocltst/env/Module.h new file mode 100644 index 0000000000..25e3017fa6 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/Module.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef OCL_TEST_MODULE_H +#define OCL_TEST_MODULE_H + +#include + +#include "OCLTest.h" +#include "OCLTestList.h" + +struct Module { + std::string name; + ModuleHandle hmodule; + TestCountFuncPtr get_count; + TestNameFuncPtr get_name; + CreateTestFuncPtr create_test; + DestroyTestFuncPtr destroy_test; + TestVersionFuncPtr get_version; + TestLibNameFuncPtr get_libname; + OCLTest** cached_test; + + Module() + : name(""), + hmodule(0), + get_count(0), + get_name(0), + create_test(0), + destroy_test(0), + get_version(0), + get_libname(0), + cached_test(0) { + // EMPTY! + } +}; + +#endif // OCL_TEST_MODULE_H diff --git a/projects/clr/opencl/tests/ocltst/env/ResultStruct.h b/projects/clr/opencl/tests/ocltst/env/ResultStruct.h new file mode 100644 index 0000000000..198a6e67ff --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/ResultStruct.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
#ifndef _RESULT_STRUCT_H_
#define _RESULT_STRUCT_H_  // was missing: the guard never took effect

#include <string>  // std::string is used below; do not rely on the includer

//! Inclusive range of test indices.
struct IndicesRange {
  int startIndex;
  int endIndex;
};

// Negative constants are parenthesised so they expand safely inside larger
// expressions.
#define INDEX_ALL_TESTS (-1)
#define EXTREMELY_SMALL_VALUE (-10000.0f)
#define EXTREMELY_LARGE_VALUE 10000.0f

//! Outcome of a single test: a numeric value, a pass/fail flag and a message
//! string that starts out as a single newline.
class TestResult {
 public:
  float value;
  std::string resultString;
  bool passed;

  //! Creates a passing result holding \p val.
  TestResult(float val) : value(val), resultString("\n"), passed(true) {}

  //! Returns the object to the state produced by TestResult(val).
  void reset(float val) {
    value = val;
    passed = true;
    resultString.assign("\n");
  }
};

//! Aggregated report: the largest and smallest results seen, an overall
//! success flag and a failure counter.
//!
//! The report owns the two heap-allocated TestResult objects. Copying a
//! Report makes deep copies of them, so each Report deletes only its own
//! objects (the compiler-generated copy used to share the pointers and
//! double-delete them).
class Report {
 public:
  TestResult *max;
  TestResult *min;
  bool success;
  int numFailedTests;

  //! Starts with max at EXTREMELY_SMALL_VALUE and min at
  //! EXTREMELY_LARGE_VALUE so that any real result replaces them.
  Report() : max(0), min(0), success(true), numFailedTests(0) {
    max = new TestResult(EXTREMELY_SMALL_VALUE);
    min = new TestResult(EXTREMELY_LARGE_VALUE);
  }

  //! Deep copy.
  Report(const Report &other)
      : max(clone(other.max)),
        min(clone(other.min)),
        success(other.success),
        numFailedTests(other.numFailedTests) {}

  //! Deep-copy assignment. The new copies are allocated before the old
  //! objects are released, so a failed allocation leaves *this untouched.
  Report &operator=(const Report &other) {
    if (this != &other) {
      TestResult *newMax = clone(other.max);
      TestResult *newMin = clone(other.min);
      delete max;
      delete min;
      max = newMax;
      min = newMin;
      success = other.success;
      numFailedTests = other.numFailedTests;
    }
    return *this;
  }

  //! Returns the report to its freshly constructed state.
  void reset() {
    max->reset(EXTREMELY_SMALL_VALUE);
    min->reset(EXTREMELY_LARGE_VALUE);
    success = true;
    numFailedTests = 0;
  }

  ~Report() {
    delete max;
    delete min;
  }

 private:
  //! Copies \p src onto the heap; a null source stays null (max/min are
  //! public, so a caller may have cleared them).
  static TestResult *clone(const TestResult *src) {
    return src ? new TestResult(*src) : 0;
  }
};

#endif  // _RESULT_STRUCT_H_
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "Timer.h" + +#ifdef ATI_OS_WIN +#include +#endif + +#ifdef ATI_OS_LINUX +#include +#endif + +CPerfCounter::CPerfCounter() : _clocks(0), _start(0) { +#ifdef ATI_OS_WIN + + QueryPerformanceFrequency((LARGE_INTEGER *)&_freq); + +#endif + +#ifdef ATI_OS_LINUX + _freq = 1000; +#endif +} + +CPerfCounter::~CPerfCounter() { + // EMPTY! 
+} + +void CPerfCounter::Start(void) { +#ifdef ATI_OS_WIN + + if (_start) { + MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK); + exit(0); + } + QueryPerformanceCounter((LARGE_INTEGER *)&_start); + +#endif +#ifdef ATI_OS_LINUX + + struct timeval s; + gettimeofday(&s, 0); + _start = (i64)s.tv_sec * 1000 + (i64)s.tv_usec / 1000; + +#endif +} + +void CPerfCounter::Stop(void) { + i64 n; + +#ifdef ATI_OS_WIN + + if (!_start) { + MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK); + exit(0); + } + + QueryPerformanceCounter((LARGE_INTEGER *)&n); + +#endif +#ifdef ATI_OS_LINUX + + struct timeval s; + gettimeofday(&s, 0); + n = (i64)s.tv_sec * 1000 + (i64)s.tv_usec / 1000; + +#endif + + n -= _start; + _start = 0; + _clocks += n; +} + +void CPerfCounter::Reset(void) { +#ifdef ATI_OS_WIN + if (_start) { + MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK); + exit(0); + } +#endif + _clocks = 0; +} + +double CPerfCounter::GetElapsedTime(void) { +#ifdef ATI_OS_WIN + if (_start) { + MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK); + exit(0); + } +#endif + + return (double)_clocks / (double)_freq; +} diff --git a/projects/clr/opencl/tests/ocltst/env/Timer.h b/projects/clr/opencl/tests/ocltst/env/Timer.h new file mode 100644 index 0000000000..058e00c44f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/Timer.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _TIMER_H_ +#define _TIMER_H_ + +#ifdef ATI_OS_WIN +typedef __int64 i64; +#endif +#ifdef ATI_OS_LINUX +typedef long long i64; +#endif + +class CPerfCounter { + public: + CPerfCounter(); + ~CPerfCounter(); + void Start(void); + void Stop(void); + void Reset(void); + double GetElapsedTime(void); + + private: + i64 _freq; + i64 _clocks; + i64 _start; +}; + +#endif // _TIMER_H_ diff --git a/projects/clr/opencl/tests/ocltst/env/Worker.h b/projects/clr/opencl/tests/ocltst/env/Worker.h new file mode 100644 index 0000000000..b9e29d370b --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/Worker.h @@ -0,0 +1,180 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef OCL_TEST_WORKER_H +#define OCL_TEST_WORKER_H + +///////////////////////////////////////////////////////////////////////////// + +#include +#include + +#include +#include +#include +#include + +#include "Module.h" +#include "OCLTest.h" +#include "OCLTestList.h" +#include "ResultStruct.h" +#include "Timer.h" +#include "getopt.h" +#include "pfm.h" + +///////////////////////////////////////////////////////////////////////////// + +typedef void* (*TestMethod)(void* param); + +///////////////////////////////////////////////////////////////////////////// + +class Worker { + public: + Worker() + : m_wrapper(0), + m_module(0), + m_run(0), + m_id(0), + m_subtest(0), + m_testindex(0), + m_dump(false), + m_display(false), + m_useCPU(false), + m_window(0), + m_width(0), + m_height(0), + m_buffer(0), + m_perflab(false), + m_deviceId(0), + m_platform(0) { + // EMPTY! 
+ } + + Worker(OCLWrapper* wrapper, Module* module, TestMethod run, unsigned int id, + unsigned int subtest, unsigned int testindex, bool dump, bool view, + bool useCPU, void* window, unsigned int x, unsigned int y, + bool perflab, unsigned int deviceId = 0, unsigned int platform = 0) + : m_wrapper(wrapper), + m_module(module), + m_run(run), + m_id(id), + m_subtest(subtest), + m_testindex(testindex), + m_dump(dump), + m_display(view), + m_useCPU(useCPU), + m_window(window), + m_width(x), + m_height(y), + m_buffer(0), + m_perflab(perflab), + m_deviceId(deviceId), + m_platform(platform) { + if (m_dump == true || m_display == true) { + m_buffer = new float[4 * m_width * m_height]; + if (m_buffer != 0) { + memset(m_buffer, 0, 4 * m_width * m_height * sizeof(float)); + } else { + m_dump = false; + m_display = false; + } + } + m_result = new TestResult(0.0f); + } + + Worker(const Worker& w) { + if (this == &w) return; + + if (m_buffer) delete[] m_buffer; + m_buffer = 0; + + m_wrapper = w.m_wrapper; + m_module = w.m_module; + m_run = w.m_run; + m_id = w.m_id; + m_subtest = w.m_subtest; + m_testindex = w.m_testindex; + m_dump = w.m_dump; + m_display = w.m_display; + m_useCPU = w.m_useCPU; + m_window = w.m_window; + m_width = w.m_width; + m_height = w.m_height; + m_perflab = w.m_perflab; + m_deviceId = w.m_deviceId; + m_result = w.m_result; + m_platform = w.m_platform; + + if (w.m_buffer) { + m_buffer = new float[4 * m_width * m_height]; + if (m_buffer != 0) { + memcpy(m_buffer, w.m_buffer, 4 * m_width * m_height * sizeof(float)); + } + } + } + + ~Worker() { + if (m_buffer) delete[] m_buffer; + m_buffer = 0; + delete m_result; + m_result = 0; + } + + OCLWrapper* getOCLWrapper() { return m_wrapper; } + Module* getModule() { return m_module; } + TestMethod getTestMethod() { return m_run; } + unsigned int getId() { return m_id; } + unsigned int getSubTest() { return m_subtest; } + unsigned int getTestIndex() { return m_testindex; } + bool isDumpEnabled() { return m_dump; } + 
bool isDisplayEnabled() { return m_display; } + bool isCPUEnabled() { return m_useCPU; } + void* getWindow() { return m_window; } + unsigned int getWidth() { return m_width; } + unsigned int getHeight() { return m_height; } + float* getBuffer() { return m_buffer; } + bool getPerflab() { return m_perflab; } + unsigned int getDeviceId() { return m_deviceId; } + TestResult* getResult() { return m_result; } + unsigned int getPlatformID() { return m_platform; } + + private: + OCLWrapper* m_wrapper; + Module* m_module; + TestMethod m_run; + unsigned int m_id; + unsigned int m_subtest; + unsigned int m_testindex; + bool m_dump; + bool m_display; + bool m_useCPU; + void* m_window; + unsigned int m_width; + unsigned int m_height; + float* m_buffer; + bool m_perflab; + unsigned int m_deviceId; + unsigned int m_platform; + TestResult* m_result; +}; + +///////////////////////////////////////////////////////////////////////////// + +#endif // OCL_TEST_WORKER_H diff --git a/projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp new file mode 100644 index 0000000000..02e2a0402d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.cpp @@ -0,0 +1,162 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "oclsysinfo.h" + +#include +#include + +#include + +#ifndef MAX_DEVICES +#define MAX_DEVICES 16 +#endif // MAX_DEVICES + +int oclSysInfo(std::string &info_string, bool use_cpu, unsigned dev_id, + unsigned int platformIndex) { + /* + * Have a look at the available platforms and pick the one + * in the platforms vector in index "platformIndex". + */ + + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + + int error = clGetPlatformIDs(0, NULL, &numPlatforms); + if (CL_SUCCESS != error) { + fprintf(stderr, "clGetPlatformIDs() failed"); + return 0; + } + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error = clGetPlatformIDs(numPlatforms, platforms, NULL); + if (CL_SUCCESS != error) { + fprintf(stderr, "clGetPlatformIDs() failed"); + return 0; + } +#if 0 + for (unsigned i = 0; i < numPlatforms; ++i) { + /* Get the number of requested devices */ + error = clGetDeviceIDs(platforms[i], (use_cpu) ? 
CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices ); +#if 0 + /* clGetDeviceIDs fails when no GPU devices are present */ + if (error) { + fprintf(stderr, "clGetDeviceIDs failed: %d\n", error ); + return 0; + } +#endif +#if 0 + char pbuf[100]; + + error = clGetPlatformInfo( + platforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuf), + pbuf, + NULL); + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + platform = platforms[i]; + break; + } +#else + /* Select platform with GPU devices present */ + if (num_devices > 0) { + platform = platforms[i]; + break; + } +#endif + } +#endif + error = clGetDeviceIDs(platforms[platformIndex], + (use_cpu) ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU, + 0, NULL, &num_devices); + if (error) { + fprintf(stderr, "clGetDeviceIDs failed: %d\n", error); + return 0; + } + platform = platforms[platformIndex]; + delete[] platforms; + } + if (dev_id >= num_devices) { + fprintf(stderr, "Device selected does not exist.\n"); + return 0; + } + if (NULL == platform) { + fprintf(stderr, + "Couldn't find platform with GPU devices, cannot proceed.\n"); + return 0; + } + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + if (!devices) { + fprintf(stderr, "no devices\n"); + return 0; + } + + /* Get the requested device */ + error = clGetDeviceIDs(platform, + (use_cpu) ? 
CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU, + num_devices, devices, NULL); + if (error) { + fprintf(stderr, "clGetDeviceIDs failed: %d\n", error); + return 0; + } + + device = devices[dev_id]; + + char c[1024]; + char tmpString[256]; + static const char *no_yes[] = {"NO", "YES"}; + sprintf(tmpString, "\nCompute Device info:\n"); + info_string.append(tmpString); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(c), &c, NULL); + sprintf(tmpString, "\tPlatform Version: %s\n", c); + info_string.append(tmpString); + clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), &c, NULL); + sprintf(tmpString, "\tDevice Name: %s\n", c); + info_string.append(tmpString); + clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), &c, NULL); + sprintf(tmpString, "\tVendor: %s\n", c); + info_string.append(tmpString); + clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), &c, NULL); + sprintf(tmpString, "\tDevice Version: %s\n", c); + info_string.append(tmpString); + clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), &c, NULL); + sprintf(tmpString, "\tDriver Version: %s\n", c); + info_string.append(tmpString); + clGetDeviceInfo(device, CL_DEVICE_BOARD_NAME_AMD, sizeof(c), &c, NULL); + sprintf(tmpString, "\tBoard Name: %s\n", c); + info_string.append(tmpString); +#if defined(ATI_OS_LINUX) + cl_device_topology_amd topology; + clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(topology), &topology, + NULL); + if (topology.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) { + sprintf(tmpString, "\tDevice Topology: PCI[ B#%d, D#%d, F#%d]\n", + topology.pcie.bus, topology.pcie.device, topology.pcie.function); + info_string.append(tmpString); + } +#endif + free(devices); + return 1; +} diff --git a/projects/clr/opencl/tests/ocltst/env/oclsysinfo.h b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.h new file mode 100644 index 0000000000..4fd1fa2d16 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/oclsysinfo.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, 
Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCLSYSINFO_H_ +#define _OCLSYSINFO_H_ +#include + +int oclSysInfo(std::string& info_string, bool useCPU, unsigned dev_id, + unsigned int platformIndex = 0); + +#endif //_OCLSYSINFO_H_ diff --git a/projects/clr/opencl/tests/ocltst/env/ocltst.cpp b/projects/clr/opencl/tests/ocltst/env/ocltst.cpp new file mode 100644 index 0000000000..888059fce7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/env/ocltst.cpp @@ -0,0 +1,1611 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +///////////////////////////////////////////////////////////////////////////// + +#include + +#ifdef ATI_OS_WIN +#include + +#include "Window.h" +typedef HMODULE ModuleHandle; +#endif + +///////////////////////////////////////////////////////////////////////////// + +#ifdef ATI_OS_LINUX +#include +typedef void* ModuleHandle; +#endif + +///////////////////////////////////////////////////////////////////////////// + +#include "BaseTestImp.h" +#include "Module.h" +#include "OCLLog.h" +#include "OCLTest.h" +#include "OCLTestImp.h" +#include "OCLTestList.h" +#include "OCLWrapper.h" +#include "Timer.h" +#include "Worker.h" +#include "getopt.h" +#include "oclsysinfo.h" +#include "pfm.h" + +//! Including OCLutilities Thread utility +#include "OCL/Thread.h" + +//! Lock that needs to be obtained to access the global +//! 
module variable +static OCLutil::Lock moduleLock; + +#include +#include + +#include +#include +#include +#include + +///////////////////////////////////////////////////////////////////////////// + +#ifdef ATI_OS_WIN +static LONG WINAPI xFilter(LPEXCEPTION_POINTERS xEP); +void serviceStubCall(); +#endif + +#define MAX_DEVICES 16 +#undef CHECK_RESULT +#define CHECK_RESULT(test, msg) \ + if ((test)) { \ + printf("\n%s\n", msg); \ + exit(1); \ + } + +//! Declaration of a function that find devices of a specific type for the +//! chosen platform +int findAdapters(unsigned int platformIdx, bool useCPU, cl_platform_id*); + +//! class App that is used to run the tests on the system +class App { + public: + static bool m_reRunFailed; + static bool m_svcMsg; + //! Constructor for App + App(unsigned int platform) + : m_list(false), + m_console(true), + m_useCPU(false), + m_dump(false), + m_perflab(false), + m_noSysInfoPrint(false), + m_numItr(1), + mp_testOrder(NULL), + m_rndOrder(false), + m_spawned(0), + m_threads(1), + m_runthread(0), + m_width(512), + m_height(512), + m_window(0), + m_platform(platform) { + // initialize OCLWrapper reference + m_wrapper = new OCLWrapper(); + + // m_workers = Set of worker objects that are used to run a subtest from a + // module + for (unsigned int i = 0; i < 256; i++) m_workers[i] = 0; + + // Setting the number of devices + /* + * Force caltst to use 1 thread at a time in Windows + * only contextual calls are thread safe currently + */ + m_numDevices = findAdapters(m_platform, m_useCPU, NULL); + // m_numDevices = 1; + + // Report structure used to store the results of the tests +#if 0 + testReport = (Report **)malloc(sizeof(Report *) * m_numDevices); + for(unsigned int i = 0; i < m_numDevices; i++) + { + testReport[i] = new Report; + } +#else + testReport = (Report**)malloc(sizeof(Report*)); + testReport[0] = new Report; +#endif + } + + //! 
Destructor for App + ~App() { + // Deleting the Worker objects + for (unsigned int i = 0; i < 256; i++) { + if (m_workers[i]) { + delete m_workers[i]; + m_workers[i] = 0; + } + } + + // Deleting the report structures + // for(unsigned int i = 0; i < m_numDevices; i++) + for (unsigned int i = 0; i < 1; i++) { + delete testReport[i]; + } + free(testReport); + m_wrapper->clUnloadPlatformAMD(mpform_id); + + delete m_wrapper; + } + + //! Function used to create a worker object corresponding to a subtest in a + //! module + void SetWorker(unsigned int index, OCLWrapper* wrapper, Module* module, + TestMethod run, unsigned int id, unsigned int subtest, + unsigned int test, bool dump, bool view, bool useCPU, + void* window, unsigned int x, unsigned int y, bool perflab, + unsigned int deviceId, unsigned int platform) { + if (index >= 256) return; + + if (m_workers[index]) delete m_workers[index]; + + m_workers[index] = + new Worker(wrapper, module, run, id, subtest, test, dump, view, useCPU, + window, x, y, perflab, deviceId, platform); + + assert(m_workers[index] != 0); + // oclTestLog(OCLTEST_LOG_ALWAYS, "Worker Device Id = %d\n", + // m_workers[index]->getDeviceId()); + } + + //! Function to return the 'index'th m_workers + Worker* GetWorker(unsigned int index) { + if (index >= 256) return 0; + + return m_workers[index]; + } + + //! Create a thread to run the subtest + void AddThread(unsigned int workerindex, unsigned int usage) { + Worker* worker = GetWorker(workerindex); + if (worker == 0) { + return; + } + + // usage = Whether to use threads or not + if (usage != 0) { + // Creating a thread + // getTestMethod = runSubTest here + // which takes a Worker object as an argument + m_pool[workerindex].create(worker->getTestMethod(), (void*)(worker)); + m_spawned++; + } else { + // Same as above without using threads + TestMethod run = worker->getTestMethod(); + if (run) { + run(worker); + UpdateTestReport(workerindex, worker->getResult()); + } + } + return; + } + + //! 
Function which waits for all threads to execute and also updates the + //! report + void WaitAllThreads() { + for (unsigned int w = 0; w < m_spawned; w++) { + m_pool[w].join(); + UpdateTestReport(w, m_workers[w]->getResult()); + } + m_spawned = 0; + } + + //! Function to add a worker thread so as to run a subtest of a module + //! @param run = runSubtest function + //! @param index = index of the module in m_modules + //! @param subtest = the subtest number to run + //! @param usage = whether to use threads or not + //! @param test = The test in the module to be executed + void AddWorkerThread(unsigned int index, unsigned int subtest, + unsigned int test, unsigned int usage, TestMethod run) { + if (m_spawned > m_threads) { + WaitAllThreads(); + } + + // Creating a worker thread for each device +#if 0 + for(unsigned int i = 0; i < m_numDevices; i++) + { + SetWorker(i, + m_wrapper, + &m_modules[index], + run, + m_spawned, + subtest, + test, + m_dump, + !m_console, + m_useCPU, + m_window, + m_width, + m_height, + m_perflab, + i, + m_platform); + } +#else + for (unsigned int i = 0; i < 1; i++) { + SetWorker(i, m_wrapper, &m_modules[index], run, m_spawned, subtest, test, + m_dump, !m_console, m_useCPU, m_window, m_width, m_height, + m_perflab, m_deviceId, m_platform); + } +#endif + + // Creating and executing a thread for each device + // for(unsigned int i = 0; i < m_numDevices; i++) + for (unsigned int i = 0; i < 1; i++) { + AddThread(i, usage); + } + } + + void printOCLinfo(void); + + //! Function to process the commandline arguments + void CommandLine(unsigned int argc, char** argv); + + //! Function to scan for the different tests in the module + void ScanForTests(); + + //! Function to run all the specified tests + void RunAllTests(); + + //! Free memory + void CleanUp(); + + //! Function to set the order in which test are executed. + void SetTestRunOrder(int); + + //! Function to print the test order + void PrintTestOrder(int); + + //! 
Function to get the number of iterations. + int GetNumItr(void) { return m_numItr; } + + private: + typedef std::vector TestIndexList; + typedef std::vector StringList; + + void AddToList(StringList& strlist, const char* str); + void LoadList(StringList& strlist, const char* filename); + + bool TestInList(StringList& strlist, const char* testname); + + //! Array storing the report for each device + Report** testReport; + + //! Function to update the result of each device + void UpdateTestReport(int index, TestResult* result) { + if (result != NULL) { + if (result->passed) { + if (testReport[index]->max->value < result->value) { + testReport[index]->max->value = result->value; + testReport[index]->max->resultString = result->resultString; + } + if (testReport[index]->min->value > result->value) { + testReport[index]->min->value = result->value; + testReport[index]->min->resultString = result->resultString; + } + } else { + testReport[index]->numFailedTests++; + testReport[index]->success = false; + } + } else { + testReport[index]->numFailedTests++; + testReport[index]->success = false; + } + } + + //! Functions used to find the range of the tests to be run + void GetTestIndexList(TestIndexList& testIndices, StringList& testList, + const char* szModuleTestname, int maxIndex); + void PruneTestIndexList(TestIndexList& testIndices, + TestIndexList& avoidIndices, + TestIndexList& erasedIndices); + + StringList m_paths; + StringList m_tests; + StringList m_avoid; + std::vector m_modules; + bool m_list; + bool m_console; + bool m_useCPU; + bool m_dump; + bool m_perflab; + bool m_noSysInfoPrint; + int m_numItr; + int* mp_testOrder; + bool m_rndOrder; + + //! m_pool = Various threads created to execute tests on multiple devices + OCLutil::Thread m_pool[256]; + + Worker* m_workers[256]; + + //! Number of threads spawned + unsigned int m_spawned; + + //! 
Upper limit on the number of threads that can be spawned + unsigned int m_threads; + unsigned int m_runthread; + unsigned int m_width; + unsigned int m_height; + void* m_window; + + //! which index/platform id from the platforms vector returned by + //! cl::Platform::get we should run on + unsigned int m_platform; + cl_platform_id mpform_id; + + //! Number of devices on the system + unsigned int m_numDevices; + // + //! Device ID to use on the system + unsigned int m_deviceId; + + // OCLWrapper reference + OCLWrapper* m_wrapper; +}; + +void App::printOCLinfo(void) { + std::string calinfo; + if (!m_noSysInfoPrint) { + oclSysInfo(calinfo, m_useCPU, m_deviceId, m_platform); + oclTestLog(OCLTEST_LOG_ALWAYS, calinfo.c_str()); + } +} + +/*----------------------------------------------------- +Function to randomize the order in which tests are executed +-------------------------------------------------------*/ +#ifdef ATI_OS_WIN +#include +#endif +// void App::SetTestRunOrder(int test_count) +void App::SetTestRunOrder(int mod_index) { + assert(mp_testOrder != NULL); + unsigned int test_count = m_modules[mod_index].get_count(); + + StringList uniqueTests; + for (unsigned int i = 0; i < m_tests.size(); ++i) { + // see if the tests are being run using indices + size_t nFirstBracket = m_tests[i].find("["); + // set the test name + std::string szTestName = m_tests[i]; + + // order of execution is set based on base name so get the base name + if (nFirstBracket != std::string::npos) + szTestName = szTestName.substr(0, nFirstBracket); + + bool bTestExists = false; + for (unsigned int j = 0; j < uniqueTests.size(); ++j) { + if (strcmp(szTestName.c_str(), uniqueTests[j].c_str()) == 0) { + bTestExists = true; + break; + } + } + + if (!bTestExists) { + AddToList(uniqueTests, szTestName.c_str()); + } + } + + for (unsigned int i = 0; i < test_count && i < uniqueTests.size(); i++) { + for (unsigned int j = 0; j < test_count; j++) { + unsigned int index = i; + // add all the prev test 
indices + for (int k = 0; k < mod_index; k++) index += m_modules[k].get_count(); + + std::string szTestName = uniqueTests[index]; + + if (strcmp(szTestName.c_str(), m_modules[mod_index].get_name(j)) == 0) { + mp_testOrder[i] = j; + break; + } + } + } + + if (m_rndOrder) { + srand((unsigned int)time(NULL)); + for (unsigned int i = 0; i < test_count; i++) { + // find two random indices + int index1 = (int)((float)test_count * (rand() / (RAND_MAX + 1.0))); + int index2 = (int)((float)test_count * (rand() / (RAND_MAX + 1.0))); + // swap the data + int tmp = mp_testOrder[index1]; + mp_testOrder[index1] = mp_testOrder[index2]; + mp_testOrder[index2] = tmp; + } + } +} + +///////////////////////////////////////////////////////////////////////////// + +// Process device string. Returns true if there is a primary ATI Radeon device +// adapter, false otherwise +static bool procDevString(const char* devString) { + // Search for the string "Radeon" inside the device string + if (strstr(devString, "Radeon") || strstr(devString, "R600") || + strstr(devString, "RV630") || strstr(devString, "RV670") || + (strstr(devString, "Stream") && strstr(devString, "Processor"))) { + // Ignore if the device is a secondary device, i.e., not an adapter + if (strstr(devString, "Secondary")) { + return false; + } + } else { + return false; + } + + return true; +} + +//! +//! Function to find the total number of adapters on the system +//! 
+int findAdapters(unsigned int platformIdx, bool useCPU, + cl_platform_id* mpform) { + unsigned int numOfAdapters = 0; + cl_int error = 0; + cl_uint numPlatforms = 0; + + error = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT((error != CL_SUCCESS), "clGetPlatformIDs failed"); + + CHECK_RESULT((platformIdx >= numPlatforms), "Invalid platform"); + + cl_platform_id* platforms = new cl_platform_id[numPlatforms]; + error = clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error != CL_SUCCESS, "clGetPlatformIDs failed"); + + cl_platform_id platform = 0; + + platform = platforms[platformIdx]; + + delete[] platforms; + + cl_device_type devType = CL_DEVICE_TYPE_GPU; + if (useCPU) devType = CL_DEVICE_TYPE_CPU; + error = clGetDeviceIDs(platform, devType, 0, 0, &numOfAdapters); + CHECK_RESULT((error != CL_SUCCESS), "clGetDeviceIDs failed"); + if (mpform) { + (*mpform) = platform; + } + + return (int)numOfAdapters; +} + +int calibrate(OCLTest* test) { + int n = 1; + +#if 0 + while (1) + { + double timer = run(test, n); + if (timer > 2.) + { + break; + } + n *= 2; + } +#endif + + return n; +} + +void* dummyThread(void* argv) { + unsigned int counter = 0; + while (counter < 1000000) counter++; + + return argv; +} + +//! Function used to run the test specified +//! It would look something like OCLPerfInputspeed[0] +double run(OCLTest* test, int passes) { + CPerfCounter counter; + + counter.Reset(); + counter.Start(); + int i; + for (i = 0; i < passes; i++) { + test->run(); + } + counter.Stop(); + double timer = counter.GetElapsedTime(); + counter.Reset(); + + return timer; +} + +//! Function to display the result after a test is finished +//! 
It also stores the result in a TestResult object +void report(Worker* w, const char* testname, int testnum, unsigned int crc, + const char* errorMsg, float timer, TestResult* tr, + const char* testDesc) { + unsigned int thread = w->getId(); + bool perflab = w->getPerflab(); + unsigned int deviceId = w->getDeviceId(); + + char tmpUnits[256]; + if (perflab) { + oclTestLog(OCLTEST_LOG_ALWAYS, "%10.3f\n", timer); + } else { + const char* passedOrFailed[] = {"FAILED", "PASSED"}; + + // char teststring[256]; + // sprintf(teststring, "%s[%d]", testname, testnum); + // sprintf(tmpUnits, "Device[%d]:\t%-32s:\t%s\n", deviceId, teststring, + // ((tr->passed) ? passedOrFailed[1] : passedOrFailed[0])); + // If crc is not 0 or errorMsg is not empty, print the full stats + if ((crc != 0) || (errorMsg && (errorMsg[0] != '\0'))) { + sprintf(tmpUnits, + "%s %s: %s[%d] T[%1d] [%3d], %10.3f %-20s (chksum 0x%08x)\n", + testDesc, ((tr->passed) ? passedOrFailed[1] : passedOrFailed[0]), + w->isCPUEnabled() ? "CPU" : "GPU", deviceId, thread, testnum, + timer, errorMsg, crc); + } else { + sprintf(tmpUnits, "%s %s: %s[%d] T[%1d] [%3d], %10.3f\n", testDesc, + ((tr->passed) ? passedOrFailed[1] : passedOrFailed[0]), + w->isCPUEnabled() ? "CPU" : "GPU", deviceId, thread, testnum, + timer); + } + + oclTestLog(OCLTEST_LOG_ALWAYS, tmpUnits); + + tr->value = timer; + tr->resultString.assign(tmpUnits); + + if (App::m_svcMsg && !tr->passed) { + char escaped[2 * sizeof(tmpUnits)]; + + char* ptr = escaped; + for (int i = 0; tmpUnits[i] != '\0'; ++i) { + switch (tmpUnits[i]) { + case '\n': + *ptr++ = '|'; + *ptr++ = 'n'; + break; + case '\r': + *ptr++ = '|'; + *ptr++ = 'r'; + break; + case '\'': + case '|': + case ']': + case '[': + *ptr++ = '|'; + default: + *ptr++ = tmpUnits[i]; + } + } + *ptr = '\0'; + + oclTestLog(OCLTEST_LOG_ALWAYS, + "##teamcity[testFailed name='%s.%s.%d' message='FAILED' " + "details='%s']\n", + w->getModule()->get_libname(), testname, testnum, escaped); + } + } +} + +//! 
Thread Entry point +void* runSubtest(void* worker) { + char units[256]; + double conversion; + unsigned int crc = 0; + bool second_run = false; + + // Getting the worker object that is running in this thread + Worker* w = (Worker*)worker; + + if (w == 0) return NULL; + + unsigned int test = w->getTestIndex(); + unsigned int subtest = w->getSubTest(); + unsigned int deviceId = w->getDeviceId(); + unsigned int platformIndex = w->getPlatformID(); + TestResult* result = w->getResult(); + +RERUN_TEST: + // Acquiring lock on the 'module' object common to all threads + moduleLock.lock(); + Module* m = w->getModule(); + if (m == 0 || m->create_test == 0) return NULL; + // If we can, used the cached version, + // otherwise create the test. + OCLTest* pt = (m->cached_test ? m->cached_test[subtest] : NULL); + if (!pt) { + pt = m->create_test(subtest); + if (pt->cache_test() && m->cached_test) { + m->cached_test[subtest] = pt; + } + } + pt->clearError(); + OCLTestImp* tmp = pt->toOCLTestImp(); + if (tmp) { + tmp->setOCLWrapper(w->getOCLWrapper()); + } + std::string subtestName = m->get_name(subtest); + moduleLock.unlock(); + + if (pt == 0) return NULL; + + pt->resetDescString(); + if (App::m_svcMsg) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "##teamcity[testStarted name='%s.%s.%d' " + "captureStandardOutput='true']\n", + m->get_libname(), subtestName.c_str(), test); + } + // setting the type to CPU. 
+ if (w->isCPUEnabled()) { + pt->useCPU(); + } + // Setting the device according to the worker thread + pt->setDeviceId(w->getDeviceId()); + pt->setPlatformIndex(w->getPlatformID()); + // Opening the 'test'th subtest of 'pt' + pt->open(test, units, conversion, deviceId); + pt->clearPerfInfo(); + + char buffer[256]; + sprintf(buffer, "%s[%3d]", subtestName.c_str(), test); + oclTestLog(OCLTEST_LOG_ALWAYS, "%-32s", buffer); + + if (pt->hasErrorOccured()) { + result->passed = false; + report(w, subtestName.c_str(), test, crc, pt->getErrorMsg(), + pt->getPerfInfo(), result, pt->testDescString.c_str()); + } else { + unsigned int n = calibrate(pt); + double timer = run(pt, n); + crc = pt->close(); + + if (pt->hasErrorOccured()) { + // run second time if the test fails the first time. + if (!second_run && App::m_reRunFailed && !App::m_svcMsg) { + second_run = true; + + // Destroying a test object + moduleLock.lock(); + if (!pt->cache_test()) { + m->destroy_test(pt); + } + moduleLock.unlock(); + + pt->clearError(); + goto RERUN_TEST; + } + } + result->passed = !pt->hasErrorOccured(); + /// print conditional pass if it is passes the second time. + if (second_run && result->passed) { + report(w, subtestName.c_str(), test, crc, "Conditional PASS", + pt->getPerfInfo(), result, pt->testDescString.c_str()); + } else { + report(w, subtestName.c_str(), test, crc, pt->getErrorMsg(), + pt->getPerfInfo(), result, pt->testDescString.c_str()); + } + } + if (App::m_svcMsg) { + oclTestLog(OCLTEST_LOG_ALWAYS, "##teamcity[testFinished name='%s.%s.%d']\n", + m->get_libname(), subtestName.c_str(), test); + } + + // Make sure we clear the error after we report that there was an error. 
+ pt->clearError(); + + // Destroying a test object + moduleLock.lock(); + if (!pt->cache_test()) { + m->destroy_test(pt); + } + moduleLock.unlock(); + return NULL; +} + +void App::PrintTestOrder(int mod_index) { + oclTestLog(OCLTEST_LOG_ALWAYS, "Module: %s (%d tests)\n", + m_modules[mod_index].name.c_str(), + m_modules[mod_index].get_count()); + + for (unsigned int j = 0; j < m_modules[mod_index].get_count(); j++) { + oclTestLog(OCLTEST_LOG_ALWAYS, "%s\n", + m_modules[mod_index].get_name(mp_testOrder[j])); + } +} + +//! Function that runs all the tests specified in the command-line +void App::RunAllTests() { +#ifdef ATI_OS_WIN + + if (!m_console) m_window = new Window("Test", 100, 100, m_width, m_height, 0); +#endif + + // + // Add all tests to run list if none specified + // + if (m_tests.size() < 1) { + for (unsigned int i = 0; i < m_modules.size(); i++) { + for (unsigned int j = 0; j < m_modules[i].get_count(); j++) { + AddToList(m_tests, m_modules[i].get_name(j)); + } + } + } + + unsigned int num_passes = 0; + unsigned int num_failures = 0; + + if (App::m_svcMsg) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "##teamcity[testSuiteStarted name='ocltst']\n"); + } + + // + // Run each test + // + for (unsigned int i = 0; i < m_modules.size(); i++) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "\n-------------------------------------------------\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + "The OpenCL Testing Module %s Version = %d \n", + m_modules[i].get_libname(), m_modules[i].get_version()); + oclTestLog(OCLTEST_LOG_ALWAYS, "------------------------------\n"); + + // array to keep track of order of test execution. 
+ int test_count = m_modules[i].get_count(); + mp_testOrder = new int[test_count]; + memset((void*)mp_testOrder, 0, sizeof(*mp_testOrder) * test_count); + SetTestRunOrder(i); + + // + // List all tests first if the option was turned on + // + if (m_list) { + PrintTestOrder(i); + delete[] mp_testOrder; + continue; + // return; + } + + for (unsigned int itr_var = 0; itr_var < m_modules[i].get_count(); + itr_var++) { + // done for random order generation + unsigned int subtest = mp_testOrder[itr_var]; + + const char* name = m_modules[i].get_name(subtest); + if (itr_var < m_tests.size() && TestInList(m_tests, name)) { + OCLTest* pt = NULL; + if (m_modules[i].cached_test) { + pt = m_modules[i].cached_test[subtest]; + } + // Try to use the cached version first! + if (!pt) { + pt = m_modules[i].create_test(subtest); + if (pt->cache_test() && m_modules[i].cached_test) { + m_modules[i].cached_test[subtest] = pt; + } + } + + int numSubTests = pt->getNumSubTests(); + assert(numSubTests > 0); + + TestIndexList testIndices; + GetTestIndexList(testIndices, m_tests, name, numSubTests - 1); + + TestIndexList avoidIndices; + GetTestIndexList(avoidIndices, m_avoid, name, numSubTests - 1); + + TestIndexList erasedIndices; + PruneTestIndexList(testIndices, avoidIndices, erasedIndices); + + int numTestsRun = 0; + for (unsigned int j = 0; j < testIndices.size(); j++) { + unsigned int test = testIndices[j]; + + WaitAllThreads(); + AddWorkerThread(i, subtest, test, pt->getThreadUsage(), runSubtest); + + for (unsigned int thread = 1; + (thread < m_threads) && (thread < m_modules.size()); thread++) { + AddWorkerThread(thread, subtest, test, pt->getThreadUsage(), + dummyThread); + } + + numTestsRun++; + } + + WaitAllThreads(); + // Printing the test report + // First checking whether the number of subtests is greater than 1. 
+ // No point printing report for a one subtest test + + if (numTestsRun > 0) { + if (testReport[0]->success) { + num_passes++; + } else { + num_failures++; + } + } + if (App::m_svcMsg) { + for (unsigned int j = 0; j < erasedIndices.size(); j++) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "##teamcity[testIgnored name='%s.%s.%d']\n", + m_modules[i].get_libname(), name, erasedIndices[j]); + } + } + + // Resetting the values of the test reports + // for(unsigned int j = 0; j < m_numDevices; j++) + for (unsigned int j = 0; j < 1; j++) { + testReport[j]->reset(); + } + m_modules[i].destroy_test(pt); + if (m_modules[i].cached_test) { + m_modules[i].cached_test[subtest] = NULL; + } + } + } + + // print the order in which the test are executed if they are + // randomized. + if (m_rndOrder) { + PrintTestOrder(i); + } + // deleting the test order + delete[] mp_testOrder; + } + + if (App::m_svcMsg) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "##teamcity[testSuiteFinished name='ocltst']\n"); + } + +#ifdef ATI_OS_WIN + if (!m_console && m_window) { + ((Window*)m_window)->ConsumeEvents(); + } +#endif + float total_tests = (float)(num_passes + num_failures); + + float percent_passed = 0.0f; + float percent_failed = 0.0f; + float percent_total = 0.0f; + if (total_tests > 0) { + percent_passed = 100.0f * ((float)num_passes / total_tests); + percent_failed = 100.0f * ((float)num_failures / total_tests); + percent_total = 100.0f * ((float)total_tests / total_tests); + } + + oclTestLog(OCLTEST_LOG_ALWAYS, "\n\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, "----------------------------------------\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, "Total Passed Tests: %8d (%6.2f%s)\n", + num_passes, percent_passed, "%"); + oclTestLog(OCLTEST_LOG_ALWAYS, "Total Failed Tests: %8d (%6.2f%s)\n", + num_failures, percent_failed, "%"); + oclTestLog(OCLTEST_LOG_ALWAYS, "----------------------------------------\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, "Total Run Tests: %8d (%6.2f%s)\n", + (int)total_tests, percent_total, "%"); + 
oclTestLog(OCLTEST_LOG_ALWAYS, "\n\n"); +} + +///////////////////////////////////////////////////////////////////////////// + +void App::AddToList(StringList& strlist, const char* str) { + std::string s(str); + + strlist.push_back(s); +} + +void App::LoadList(StringList& strlist, const char* filename) { + char buffer[1024]; + + FILE* fp = fopen(filename, "r"); + + if (fp == NULL) return; + + while (fgets(buffer, 1000, fp) != NULL) { + size_t length = strlen(buffer); + if (length > 0) { + if (buffer[length - 1] != '\n') { + length++; + } + buffer[length - 1] = 0; + AddToList(strlist, buffer); + } + } + + fclose(fp); +} + +static void Help(const char* name) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "%s (-w | -v | -m | -M | -l | -t | -T | -p | -d | -x | -y | -g| " + "-o | -n )\n", + name); + oclTestLog(OCLTEST_LOG_ALWAYS, " -w : enable window mode\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -v : enable TeamCity service messages\n"); + oclTestLog( + OCLTEST_LOG_ALWAYS, + " -d : dump test output to portable float map (pfm)\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -m : specify a DLL module with tests\n"); + oclTestLog( + OCLTEST_LOG_ALWAYS, + " -M : specify a text file with one DLL module per line\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -l : list test names in DLL modules and exit\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -s : number of threads to spawn\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, " -t : run test\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -T : specify a text file with one test per line\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -a : specify a test to avoid\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -A : specify a text file of tests to avoid with " + "one test per line\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -p : specify a platform to run on, 'amd','nvidia' " + "or 'intel'\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, " -h : this help text\n"); + oclTestLog( + OCLTEST_LOG_ALWAYS, + " -x : x dimension for debug output image (and window)\n"); + oclTestLog( + 
OCLTEST_LOG_ALWAYS, + " -y : y dimension for debug output image (and window)\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -P : Perflab mode (just print the result without " + "any supplementary information)\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -n #number : run the tests specified with -m, -M, -t or -T " + "options multiple times\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -r : Option to Randomize the order in which the " + "tests are executed.\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -R : Option to ReRun failed tests for conditional " + "pass.\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -i : Don't print system information\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -g : GPUid to run the tests on\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -o : dump the output to a specified file\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " -c : Run the test on the CPU device.\n"); + oclTestLog(OCLTEST_LOG_ALWAYS, " : \n"); + oclTestLog(OCLTEST_LOG_ALWAYS, + " : To run only one subtest of a test, append the " + "subtest to\n"); + oclTestLog( + OCLTEST_LOG_ALWAYS, + " : the end of the test name in brackets. i.e. 
test[1]"); + oclTestLog(OCLTEST_LOG_ALWAYS, "\n"); + + exit(0); +} + +unsigned int getPlatformID(const char* str) { + std::string strOfCLVendor(str); + std::string strOfCLPlatformName; + unsigned int platform = 0; + + // currently, the only input values amd,nvidia and intel are supported + if (strOfCLVendor == "amd") { + strOfCLPlatformName = "Advanced Micro Devices, Inc."; + } else if (strOfCLVendor == "intel") { + strOfCLPlatformName = "Intel(R) Corporation"; + } else if (strOfCLVendor == "nvidia") { + strOfCLPlatformName = "NVIDIA Corporation"; + } else { + // fall-back on platform index 0 + return platform; + } + + cl_int status; + cl_uint numPlatforms = 0; + + status = clGetPlatformIDs(0, NULL, &numPlatforms); + if (status != CL_SUCCESS) { + return platform; + } + + cl_platform_id* platforms = new cl_platform_id[numPlatforms]; + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + + if (status == CL_SUCCESS) { + unsigned int i; + for (i = 0; i < numPlatforms; ++i) { + char buff[200]; + status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buff), + buff, NULL); + if (status != CL_SUCCESS) { + break; + } + if (strcmp(buff, strOfCLPlatformName.c_str()) == 0) { + platform = i; + break; + } + } + } + + delete[] platforms; + return platform; +} + +unsigned int parseCommandLineForPlatform(unsigned int argc, char** argv) { + int c; + unsigned int platform = 0; + + while ((c = getopt(argc, argv, "dg:lm:M:o:Ps:t:T:a:A:p:v:wxy:in:rcRV")) != + -1) { + switch (c) { + case 'p': + platform = getPlatformID(optarg); + break; + default: + break; + } + } + return platform; +} + +void App::CommandLine(unsigned int argc, char** argv) { + unsigned int i = 1; + int c; + bool hasOption = false; + unsigned int tmpNumDevices = 0; + unsigned int tmpDeviceId = 0; + m_deviceId = 0; + int tmp; + + while ((c = getopt(argc, argv, "dg:lm:M:o:Ps:t:T:a:A:p:v:wxy:in:rcRV")) != + -1) { + switch (c) { + case 'c': + m_useCPU = true; + break; + + case 'p': + break; + + case 'w': 
+ m_console = false; + hasOption = true; + break; + + case 'V': + m_svcMsg = true; + break; + + case 'd': + m_dump = true; + hasOption = true; + break; + + case 'm': + AddToList(m_paths, optarg); + hasOption = true; + break; + + case 'M': + LoadList(m_paths, optarg); + hasOption = true; + break; + + case 'a': + AddToList(m_avoid, optarg); + hasOption = true; + break; + + case 'A': + LoadList(m_avoid, optarg); + hasOption = true; + break; + + case 'l': + m_list = true; + hasOption = true; + break; + + // command line switch to loop execution of any specified test or tests n + // number of times + case 'n': + m_numItr = atoi(optarg); + break; + + // command line switch to randomize the order of test execution in OCLTest + case 'r': + m_rndOrder = true; + break; + + // command line switch to rerun the failed tests to see if they pass on + // second run + case 'R': { + m_reRunFailed = true; + break; + } + case 't': + AddToList(m_tests, optarg); + hasOption = true; + break; + + case 'T': + LoadList(m_tests, optarg); + hasOption = true; + break; + + case 's': + m_threads = atoi(optarg); + hasOption = true; + break; + + case 'h': + Help(argv[0]); + break; + + case 'x': + m_width = atoi(optarg); + hasOption = true; + break; + + case 'y': + m_height = atoi(optarg); + hasOption = true; + break; + + case 'P': + m_perflab = true; + hasOption = true; + break; + case 'g': +#if 0 + tmpNumDevices = (unsigned int)atoi(optarg); + if(m_numDevices < tmpNumDevices) + { + oclTestLog(OCLTEST_LOG_ALWAYS, "Number of Devices(%d) less than specified by the user(%d). 
Using %d devices.\n", m_numDevices, tmpNumDevices, m_numDevices); + } + else + { + m_numDevices = tmpNumDevices; + } +#else + tmpDeviceId = (unsigned int)atoi(optarg); +#endif + break; + case 'v': + tmp = atoi(optarg); + if (tmp >= 0 && tmp < 100) { + oclTestSetLogLevel(atoi(optarg)); + } else { + oclTestLog(OCLTEST_LOG_ALWAYS, "Invalid verbose level\n"); + } + break; + case 'o': { + hasOption = true; + oclTestEnableLogToFile(optarg); + } break; + case 'i': + m_noSysInfoPrint = true; + break; + default: + Help(argv[0]); + break; + } + } + + // Reset devices in case user overrode defaults + m_numDevices = findAdapters(m_platform, m_useCPU, &mpform_id); + if (m_numDevices < (tmpDeviceId + 1)) { + m_deviceId = 0; + oclTestLog(OCLTEST_LOG_ALWAYS, + "User specified deviceId(%d) exceedes the number of " + "Devices(%d). Using device %d.\n", + tmpDeviceId, m_numDevices, m_deviceId); + } else { + m_deviceId = tmpDeviceId; + } + + if (!hasOption) { + Help(argv[0]); + } +} + +bool App::TestInList(StringList& strlist, const char* szModuleTestname) { + if (szModuleTestname == NULL) { + return false; + } + for (unsigned int i = 0; i < strlist.size(); i++) { + // check to see if an index is specified for this test name + int nIndex = -1; + std::string szTestName = strlist[i]; + if (szTestName.find("[") != std::string::npos) { + size_t nFirstBracket = szTestName.find("["); + size_t nLastBracket = szTestName.find("]"); + if ((nFirstBracket != std::string::npos) && + (nLastBracket != std::string::npos) && + (nLastBracket > nFirstBracket)) { + szTestName = szTestName.substr(0, nFirstBracket); + } + } + if (strcmp(szModuleTestname, szTestName.c_str()) == 0) { + return true; + } + } + + return false; +} + +void App::GetTestIndexList(TestIndexList& testIndices, StringList& testList, + const char* szModuleTestname, int maxIndex) { + for (unsigned int i = 0; i < testList.size(); i++) { + IndicesRange nIndex = {0, maxIndex}; + + // If the test name string ends with [...] 
parse the text + // between the brackets to determine the index range. + std::string szTestName = testList[i]; + if (szTestName.find("[") != std::string::npos) { + size_t nFirstBracket = szTestName.find("["); + size_t nLastBracket = szTestName.find("]"); + if ((nFirstBracket != std::string::npos) && + (nLastBracket != std::string::npos) && + (nLastBracket > nFirstBracket)) { + // Getting the string between the brackets '[' and ']' + // The values can be one of the following:- + // [a-b] - Run tests from a to b + // [a-] - Run tests from subtest a to subtest total_tests + // [-b] - Run tests from subtest 0 to subtest b + // a and b are indices of the tests to run + + std::string nIndexString = szTestName.substr( + nFirstBracket + 1, nLastBracket - nFirstBracket - 1); + size_t nIntermediateHyphen = szTestName.find("-"); + if ((nIntermediateHyphen != std::string::npos) && + (nIntermediateHyphen < nLastBracket) && + (nIntermediateHyphen > nFirstBracket)) { + // Getting the start index + if ((nIntermediateHyphen - 1) == nFirstBracket) { + nIndex.startIndex = 0; + } else { + nIndex.startIndex = + atoi(szTestName + .substr(nFirstBracket + 1, + nIntermediateHyphen - nFirstBracket - 1) + .c_str()); + } + + // Getting the end index + if ((nIntermediateHyphen + 1) == nLastBracket) { + nIndex.endIndex = maxIndex; + } else { + nIndex.endIndex = + atoi(szTestName + .substr(nIntermediateHyphen + 1, + nLastBracket - nIntermediateHyphen - 1) + .c_str()); + } + } else { + nIndex.startIndex = atoi( + szTestName + .substr(nFirstBracket + 1, nLastBracket - nFirstBracket - 1) + .c_str()); + nIndex.endIndex = nIndex.startIndex; + } + } + + szTestName = szTestName.substr(0, nFirstBracket); + } + + if (strcmp(szModuleTestname, szTestName.c_str()) == 0) { + // If the values are out of order, swap them. 
+ if (nIndex.startIndex > nIndex.endIndex) { + int tmp = nIndex.startIndex; + nIndex.startIndex = nIndex.endIndex; + nIndex.endIndex = tmp; + } + + // Add the indices in the specified range to the list. + for (int i = nIndex.startIndex; i <= nIndex.endIndex; ++i) { + if (i <= maxIndex) { + testIndices.push_back(i); + } else { + oclTestLog(OCLTEST_LOG_ALWAYS, + "Error: Invalid test index for subtest: %s!\n", + szModuleTestname); + } + } + + // Now sort and prune duplicates. + std::sort(testIndices.begin(), testIndices.end()); + std::unique(testIndices.begin(), testIndices.end()); + } + } +} + +void App::PruneTestIndexList(TestIndexList& testIndices, + TestIndexList& avoidIndices, + TestIndexList& erasedIndices) { + for (TestIndexList::iterator it = testIndices.begin(); + it != testIndices.end();) { + unsigned int index = *it; + TestIndexList::iterator result = + std::find(avoidIndices.begin(), avoidIndices.end(), index); + if (result != avoidIndices.end()) { + it = testIndices.erase(it); + erasedIndices.push_back(index); + } else { + ++it; + } + } +} + +void App::ScanForTests() { + for (unsigned int i = 0; i < m_paths.size(); i++) { + Module mod; + +#ifdef ATI_OS_WIN + std::string::iterator myIter; + myIter = m_paths[i].end(); + myIter--; + if (*myIter == 0x0a) m_paths[i].erase(myIter); + + mod.hmodule = LoadLibrary(m_paths[i].c_str()); +#endif +#ifdef ATI_OS_LINUX + mod.hmodule = dlopen(m_paths[i].c_str(), RTLD_NOW); +#endif + + if (mod.hmodule == NULL) { + fprintf(stderr, "Could not load module: %s\n", m_paths[i].c_str()); +#ifdef ATI_OS_LINUX + fprintf(stderr, "Error : %s\n", dlerror()); +#else +#endif + } else { + mod.name = m_paths[i]; + +#ifdef ATI_OS_WIN + mod.get_count = (TestCountFuncPtr)GetProcAddress(mod.hmodule, + "OCLTestList_TestCount"); + mod.get_name = + (TestNameFuncPtr)GetProcAddress(mod.hmodule, "OCLTestList_TestName"); + mod.create_test = (CreateTestFuncPtr)GetProcAddress( + mod.hmodule, "OCLTestList_CreateTest"); + mod.destroy_test = 
(DestroyTestFuncPtr)GetProcAddress( + mod.hmodule, "OCLTestList_DestroyTest"); + mod.get_version = (TestVersionFuncPtr)GetProcAddress( + mod.hmodule, "OCLTestList_TestLibVersion"); + mod.get_libname = (TestLibNameFuncPtr)GetProcAddress( + mod.hmodule, "OCLTestList_TestLibName"); +#endif +#ifdef ATI_OS_LINUX + mod.get_count = + (TestCountFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestCount"); + mod.get_name = + (TestNameFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestName"); + mod.create_test = + (CreateTestFuncPtr)dlsym(mod.hmodule, "OCLTestList_CreateTest"); + mod.destroy_test = + (DestroyTestFuncPtr)dlsym(mod.hmodule, "OCLTestList_DestroyTest"); + mod.get_version = + (TestVersionFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestLibVersion"); + mod.get_libname = + (TestLibNameFuncPtr)dlsym(mod.hmodule, "OCLTestList_TestLibName"); +#endif + mod.cached_test = new OCLTest*[mod.get_count()]; + for (int x = 0, y = mod.get_count(); x < y; ++x) { + mod.cached_test[x] = NULL; + } + m_modules.push_back(mod); + } + } +} + +void App::CleanUp() { + for (unsigned int i = 0; i < m_modules.size(); i++) { + if (m_modules[i].cached_test) { + delete[] m_modules[i].cached_test; + } +#ifdef ATI_OS_WIN + FreeLibrary(m_modules[i].hmodule); +#endif +#ifdef ATI_OS_LINUX + dlclose(m_modules[i].hmodule); +#endif + } + +#ifdef ATI_OS_WIN + if (m_window) delete m_window; + m_window = 0; +#endif +} + +extern int optind; +///////////////////////////////////////////////////////////////////////////// +bool App::m_reRunFailed = false; +bool App::m_svcMsg = false; +int main(int argc, char** argv) { + unsigned int platform = 0; + platform = parseCommandLineForPlatform(argc, argv); + // reset optind as we really didn't parse the full command line + optind = 0; + App app(platform); +#ifdef ATI_OS_WIN + // this function is registers windows service routine when ocltst is launched + // by the OS on service initialization. On other scenarios, this function does + // nothing. 
+ serviceStubCall(); + // SetErrorMode(SEM_NOGPFAULTERRORBOX); + // const LPTOP_LEVEL_EXCEPTION_FILTER oldFilter = + // SetUnhandledExceptionFilter(xFilter); +#endif // ATI_OS_WIN +#ifdef AUTO_REGRESS + try { +#endif /* AUTO_REGRESS */ + app.CommandLine(argc, argv); + app.printOCLinfo(); + app.ScanForTests(); + for (int i = 0; i < app.GetNumItr(); i++) { + app.RunAllTests(); + } + app.CleanUp(); +#ifdef AUTO_REGRESS + } catch (...) { + oclTestLog(OCLTEST_LOG_ALWAYS, "Exiting due to unhandled exception!\n"); + return (-1); + } +#endif /* AUTO_REGRESS */ + + return 0; +} + +#ifdef ATI_OS_WIN + +#include + +typedef unsigned int uint32; +typedef size_t uintp; + +struct StackEntry { + uintp addr; + uint32 line; + uint32 disp; + char symbol[128]; + char file[128]; +}; + +static const unsigned int MAX_DEPTH_PER_NODE = 24; +struct Info { + bool operator==(const Info& b) const { return key == b.key; } + + uintp key; // pointer, handle, whatever + StackEntry stack[MAX_DEPTH_PER_NODE]; +}; + +static void dumpTraceBack(CONTEXT& context) { + Info info; + + oclTestLog(OCLTEST_LOG_ALWAYS, "Exception: exiting!\n"); + HANDLE process = GetCurrentProcess(); + + STACKFRAME64 stackframe; + memset(&stackframe, 0, sizeof(STACKFRAME64)); + +#if defined(_WIN64) + stackframe.AddrPC.Offset = context.Rip; + stackframe.AddrPC.Mode = AddrModeFlat; + stackframe.AddrStack.Offset = context.Rsp; + stackframe.AddrStack.Mode = AddrModeFlat; + stackframe.AddrFrame.Offset = context.Rbp; + stackframe.AddrFrame.Mode = AddrModeFlat; +#else + stackframe.AddrPC.Offset = context.Eip; + stackframe.AddrPC.Mode = AddrModeFlat; + stackframe.AddrStack.Offset = context.Esp; + stackframe.AddrStack.Mode = AddrModeFlat; + stackframe.AddrFrame.Offset = context.Ebp; + stackframe.AddrFrame.Mode = AddrModeFlat; +#endif + unsigned int depth = 0; + + if (SymInitialize(process, NULL, true)) { + while ((depth < MAX_DEPTH_PER_NODE) && + StackWalk64(IMAGE_FILE_MACHINE_I386, process, GetCurrentThread(), + &stackframe, &context, 
NULL, SymFunctionTableAccess64, + SymGetModuleBase64, NULL)) { + if (stackframe.AddrPC.Offset != 0) { + // + // we don't want to evaluate the names/lines yet + // so just record the address + // + info.stack[depth].addr = (uintp)stackframe.AddrPC.Offset; + + DWORD64 disp64; + DWORD disp; + IMAGEHLP_SYMBOL64* symInfo; + IMAGEHLP_LINE64 lineInfo; + uintp addr = (uintp)stackframe.AddrPC.Offset; + char buffer[128]; + + symInfo = (IMAGEHLP_SYMBOL64*)&buffer[0]; + symInfo->SizeOfStruct = sizeof(symInfo); + symInfo->MaxNameLength = (sizeof(buffer) - sizeof(IMAGEHLP_SYMBOL64)); + + lineInfo.SizeOfStruct = sizeof(lineInfo); + + if (SymGetSymFromAddr64(process, addr, &disp64, symInfo)) { + sprintf(info.stack[depth].symbol, "%s", symInfo->Name); + info.stack[depth].disp = (uint32)disp64; + } else { + sprintf(info.stack[depth].symbol, ""); + } + + if (SymGetLineFromAddr64(process, addr, &disp, &lineInfo)) { + sprintf(info.stack[depth].file, "%s", lineInfo.FileName); + info.stack[depth].line = lineInfo.LineNumber; + } else { + info.stack[depth].file[0] = '\0'; + } + depth++; + } + } + } + + SymCleanup(process); + + int j = 0; + while (j < MAX_DEPTH_PER_NODE && info.stack[j].addr != 0) { + oclTestLog(OCLTEST_LOG_ALWAYS, " %s()+%d (0x%.8x) %s:%d\n", + info.stack[j].symbol, info.stack[j].disp, info.stack[j].addr, + info.stack[j].file, info.stack[j].line); + + j++; + } +} + +static LONG WINAPI xFilter(LPEXCEPTION_POINTERS xEP) { + CONTEXT context; + CONTEXT* xCtx = &context; + memset(xCtx, 0, sizeof(CONTEXT)); + context.ContextFlags = CONTEXT_FULL; + memcpy(xCtx, xEP->ContextRecord, sizeof(CONTEXT)); + + dumpTraceBack(context); + + return (EXCEPTION_EXECUTE_HANDLER); +} +#undef CHECK_RESULT +#endif // WIN_OS + +///////////////////////////////////////////////////////////////////////////// diff --git a/projects/clr/opencl/tests/ocltst/env/pfm.cpp b/projects/clr/opencl/tests/ocltst/env/pfm.cpp new file mode 100644 index 0000000000..4e22fe1d8c --- /dev/null +++ 
b/projects/clr/opencl/tests/ocltst/env/pfm.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
/*
 * Writes a float image to `filename` as a colour PFM ("PF") file.
 *
 * The file starts with the text header "PF\n<width> <height>\n-1\n" and is
 * followed by the pixel data, bottom scanline first, three floats per pixel
 * in host byte order. When the source has fewer than three components the
 * first component is replicated into the missing channels; components
 * beyond the third are ignored.
 * NOTE(review): the scale is always written as -1, which PFM readers
 * conventionally take to mean little-endian data - confirm if this code
 * ever runs on a big-endian host.
 *
 * @param filename    path of the file to create or overwrite
 * @param buffer      width * height * components floats, top scanline first
 * @param width       image width in pixels (any value; rows are staged in
 *                    fixed-size chunks)
 * @param height      image height in pixels
 * @param components  number of floats stored per source pixel
 * @return 0 on success, 1 if an argument is NULL, the file could not be
 *         opened, or a write failed
 */
unsigned int SavePFM(const char* filename, const float* buffer,
                     unsigned int width, unsigned int height,
                     unsigned int components) {
  /* Pixels converted per fwrite call; bounds the stack buffer below. */
  enum { kChunkPixels = 4096 };
  float line[3 * kChunkPixels];
  unsigned int error = 0;
  FILE* fh;

  if (filename == NULL || buffer == NULL) {
    return 1;
  }

  //
  // open the image file for writing
  //
  if ((fh = fopen(filename, "wb")) == NULL) {
    return 1;
  }

  //
  // write the PFM header
  //
#define PFMEOL "\x0a"
  if (fprintf(fh, "PF" PFMEOL "%u %u" PFMEOL "-1" PFMEOL, width, height) < 0) {
    error = 1;
  }

  //
  // write each scanline, last row of the source first
  //
  for (unsigned int y = height; y > 0 && !error; y--) {
    /* size_t arithmetic: the element offset can exceed 32 bits. */
    const float* row = buffer + (size_t)components * width * (y - 1);
    unsigned int done = 0;

    while (done < width) {
      unsigned int count = width - done;
      if (count > kChunkPixels) {
        count = kChunkPixels;
      }

      for (unsigned int i = 0; i < count; i++) {
        const float* px = row + (size_t)(done + i) * components;
        line[i * 3 + 0] = px[0];
        line[i * 3 + 1] = (components > 1) ? px[1] : px[0];
        line[i * 3 + 2] = (components > 2) ? px[2] : px[0];
      }

      const size_t wanted = (size_t)count * 3;
      if (fwrite(line, sizeof(float), wanted, fh) != wanted) {
        error = 1;
        break;
      }
      done += count;
    }
  }

  /* fclose flushes the stream; a failure there means data was lost. */
  if (fclose(fh) != 0) {
    error = 1;
  }

  return error;
}
#ifndef _PFM_H_
#define _PFM_H_

// Saves a float image as a colour PFM ("PF") file.
//
// The implementation in pfm.cpp writes a text header containing the width
// and height followed by three floats per pixel, starting with the last
// scanline of `buffer`. Sources with fewer than three components have their
// first component copied into the missing channels.
//
// filename    path of the file to write
// buffer      width * height * components floats
// width       image width in pixels
// height      image height in pixels
// components  number of floats per source pixel
//
// Returns 0 on success and 1 when the file cannot be opened or a scanline
// cannot be written completely.
extern unsigned int SavePFM(const char* filename, const float* buffer,
                            unsigned int width, unsigned int height,
                            unsigned int components);

#endif  // _PFM_H_
namespace OCLutil {
//! \class Lock
//! \brief Provides a wrapper for locking primitives used to
//! synchronize _CPU_ threads.
//!
//! The underlying primitive is a CRITICAL_SECTION when ATI_OS_WIN is
//! defined and a pthread_mutex_t otherwise.
//!
//! Common usage would be:
//!
//! OCLutil::Lock lock;
//!
//! ....
//!
//! // Critical section begins
//!
//! lock.lock();
//!
//! .....
//!
//! // Critical section ends
//!
//! lock.unlock();
//!

class Lock {
 public:
  //! Constructor for Lock
  Lock();

  //! Destructor for Lock
  ~Lock();

  //! Acquire the lock; if it is not available, wait until it is
  void lock();

  //! Try to acquire the lock without waiting.
  //! NOTE(review): presumably returns true when the lock was acquired and
  //! false when it is held elsewhere - confirm against the implementation
  bool tryLock();

  //! Release the lock
  void unlock();

 private:
  /////////////////////////////////////////////////////////////
  //!
  //! Private data members and methods
  //!

  //! System specific synchronization primitive
#ifdef ATI_OS_WIN
  CRITICAL_SECTION _cs;
#else
  pthread_mutex_t _lock;
#endif
};

//////////////////////////////////////////////////////////////
//!
//! \class Thread
//! \brief Provides a wrapper for creating a _CPU_ thread.
//!
//! This class provides a simple wrapper around a CPU thread.
//! The class name might be a bit confusing, especially considering
//! the GPU has its own threads as well.
//!
class Thread {
 public:
  //! Thread constructor and destructor. Note that the thread is
  //! NOT created in the constructor. The thread creation takes
  //! place in the create method
  Thread();

  ~Thread();

  //! Start the thread (wrapper for pthread_create on the pthread build).
  //! Pass the thread's entry point and the data to be handed to it.
  //! NOTE(review): the bool result presumably reports whether the thread
  //! was started - confirm against the implementation
  bool create(oclThreadFunc func, void *arg);

  //! Wait for the thread (wrapper for pthread_join on the pthread build).
  //! The calling thread will wait until _this_ thread exits
  bool join();

  //! Get the thread data passed by the application (returns _data)
  void *getData() { return _data; }

  //! Get the thread ID.
  //! Static, so it takes no Thread instance; NOTE(review): presumably the
  //! ID of the calling thread - confirm against the implementation
  static unsigned int getID();

 private:
  /////////////////////////////////////////////////////////////
  //!
  //! Private data members and methods
  //!

#ifdef ATI_OS_WIN
  //! Handle of the thread
  HANDLE _tid;

  //! Numeric thread ID
  unsigned int _ID;
#else
  //! pthread identifier of the thread
  pthread_t _tid;

  //! pthread attribute object kept alongside the thread identifier
  pthread_attr_t _attr;
#endif

  //! Application data returned by getData()
  void *_data;
};
};  // namespace OCLutil
#ifndef OCLLOG_H_
#define OCLLOG_H_

// DLLIMPORT marks the logging entry points for DLL linkage on Windows:
// exported while building the logging library itself (OCLTST_LOG_BUILD
// defined), imported everywhere else. It expands to nothing on other
// platforms.
#ifdef ATI_OS_WIN

#ifdef OCLTST_LOG_BUILD
#define DLLIMPORT __declspec(dllexport)
#else
#define DLLIMPORT __declspec(dllimport)
#endif  // OCLTST_LOG_BUILD

#else
#define DLLIMPORT

#endif  // ATI_OS_WIN

// Level tag passed with every log message.
// NOTE(review): how a message's level is compared against the value given
// to oclTestSetLogLevel is decided in the logging library, not visible
// here.
enum oclLoggingLevel {
  OCLTEST_LOG_ALWAYS,
  OCLTEST_LOG_VERBOSE,
};

// Emits a printf-style formatted message tagged with logLevel.
extern DLLIMPORT void oclTestLog(oclLoggingLevel logLevel, const char* fmt,
                                 ...);
// Sets the log level; the ocltst command line passes the value of its -v
// option here after checking that it lies in 0..99.
extern DLLIMPORT void oclTestSetLogLevel(int level);
// Enables logging to the named file; the ocltst command line passes the
// argument of its -o option here.
extern DLLIMPORT void oclTestEnableLogToFile(const char* filename);

#endif  // OCLLOG_H_
// Abstract interface of a single ocltst test. Test modules create and
// destroy instances through the exported functions typed in OCLTestList.h
// (CreateTestFuncPtr / DestroyTestFuncPtr); the harness talks to a test
// exclusively through the virtual functions declared here.
class OCLTest {
 public:
  virtual unsigned int getThreadUsage(void) = 0;
  // Number of subtests this test provides.
  virtual int getNumSubTests(void) = 0;

  // open() overloads prepare the test before run(). `test` presumably
  // selects the subtest index - TODO confirm against the implementations.
  virtual void open() = 0;
  virtual void open(unsigned int test, const char* deviceName,
                    unsigned int architecture) = 0;

  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceId, unsigned int platformIndex) = 0;

  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceId) = 0;

  virtual void run(void) = 0;
  virtual unsigned int close(void) = 0;

  // Error state reported by the test.
  virtual void setErrorMsg(const char* error) = 0;
  virtual const char* getErrorMsg(void) = 0;
  virtual bool hasErrorOccured(void) = 0;
  virtual void clearError() = 0;

  virtual void setDeviceId(unsigned int deviceId) = 0;
  virtual void setPlatformIndex(unsigned int platformIndex) = 0;

  // Access to the concrete implementation classes.
  virtual OCLTestImp* toOCLTestImp() = 0;
  virtual BaseTestImp* toBaseTestImp() = 0;

  virtual float getPerfInfo() = 0;
  virtual void clearPerfInfo(void) = 0;

  virtual void setIterationCount(int cnt) = 0;
  virtual void useCPU() = 0;
  // Having this return true will allow the creation of the
  // test to be cached in between runs and will only be
  // deleted after all the tests are finished running.
  // The base implementation below returns true.
  // NOTE(review): this comment used to state that the default is false and
  // that few tests support caching, which contradicts the code - confirm
  // which default is intended.
  // FIXME: Switch all tests to support caching.
  virtual bool cache_test() { return true; }

  // Free-form description text owned by the test object.
  std::string testDescString;
  // Empties testDescString.
  void resetDescString(void) { testDescString.clear(); }

  // Virtual so that tests can be destroyed through an OCLTest pointer.
  virtual ~OCLTest(){};
};
#ifndef _OCLMODULE_H_
#define _OCLMODULE_H_

// Calling convention of the functions a test module exports: __cdecl on
// Windows, the platform default on Linux.
// NOTE(review): OCLLCONV stays undefined when neither ATI_OS_WIN nor
// ATI_OS_LINUX is defined, so the typedefs below would not compile there.
#ifdef ATI_OS_WIN
#define OCLLCONV __cdecl
#endif
#ifdef ATI_OS_LINUX
#define OCLLCONV
#endif

class OCLTest;

//
// exported function pointer typedefs
//
// The ocltst harness resolves these by name from each loaded module with
// GetProcAddress / dlsym; the symbol looked up is given next to each type.
//

// "OCLTestList_TestCount": number of tests in the module.
typedef unsigned int(OCLLCONV *TestCountFuncPtr)(void);
// "OCLTestList_TestName": takes an unsigned int (presumably the test index)
// and returns a name string.
typedef const char *(OCLLCONV *TestNameFuncPtr)(unsigned int);
// "OCLTestList_CreateTest": takes an unsigned int (presumably the test
// index) and returns an OCLTest instance.
typedef OCLTest *(OCLLCONV *CreateTestFuncPtr)(unsigned int);
// "OCLTestList_DestroyTest": takes the OCLTest instance to dispose of.
typedef void(OCLLCONV *DestroyTestFuncPtr)(OCLTest *);
// "OCLTestList_TestLibVersion": version number of the module.
typedef unsigned int(OCLLCONV *TestVersionFuncPtr)(void);
// "OCLTestList_TestLibName": name of the module.
typedef const char *(OCLLCONV *TestLibNameFuncPtr)(void);

#endif  // _OCLMODULE_H_
// @brief  Load the contents of a file into a string.
// @param  FN  Name of the file to be loaded
// @param  S   String that receives the loaded file
// @return true on success
// NOTE(review): whether S is modified when loading fails is not specified
// here - check the implementation before relying on it.
bool loadFile(const char* FN, std::string& S);
// Function pointer type for the AMD platform-unload entry point, which is
// not declared by the standard OpenCL headers included above.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clUnloadPlatformAMD_fn)(
    cl_platform_id id);

// Function Pointer Declarations for cl_khr_gl_sharing extension (missing in
// cl_gl.h)
// GL object names and targets are spelled as plain unsigned int / int in
// these signatures rather than with OpenGL typedefs.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLContextInfoKHR_fn)(
    const cl_context_properties *properties, cl_gl_context_info param_name,
    size_t param_value_size, void *param_value, size_t *param_value_size_ret);

// NOTE(review): errcode_ret is declared as int* here while the sibling
// typedefs below use cl_int*.
typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLBuffer_fn)(
    cl_context context, cl_mem_flags flags, unsigned int bufobj,
    int *errcode_ret);

typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture_fn)(
    cl_context context, cl_mem_flags flags, unsigned int texture_target,
    int miplevel, unsigned int texture, cl_int *errcode_ret);

// Same signature as clCreateFromGLTexture_fn.
typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture2D_fn)(
    cl_context context, cl_mem_flags flags, unsigned int texture_target,
    int miplevel, unsigned int texture, cl_int *errcode_ret);

typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLRenderbuffer_fn)(
    cl_context context, cl_mem_flags flags, unsigned int renderbuffer,
    cl_int *errcode_ret);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLObjectInfo_fn)(
    cl_mem memobj, cl_gl_object_type *gl_object_type,
    unsigned int *gl_object_name);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLTextureInfo_fn)(
    cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
    void *param_value, size_t *param_value_size_ret);

// Acquire and release share one signature: a queue, the memory objects and
// the usual event wait list / output event.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueAcquireGLObjects_fn)(
    cl_command_queue command_queue, cl_uint num_objects,
    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueReleaseGLObjects_fn)(
    cl_command_queue command_queue, cl_uint num_objects,
    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event);

// Function Pointer Declarations for performance counters
typedef CL_API_ENTRY cl_perfcounter_amd(CL_API_CALL *clCreatePerfCounterAMD_fn)(
    cl_device_id device, cl_perfcounter_property *properties,
    cl_int *errcode_ret);

// Begin and end share one signature: a queue, the counters and the usual
// event wait list / output event.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueBeginPerfCounterAMD_fn)(
    cl_command_queue command_queue, cl_uint num_perf_counters,
    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueEndPerfCounterAMD_fn)(
    cl_command_queue command_queue, cl_uint num_perf_counters,
    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetPerfCounterInfoAMD_fn)(
    cl_perfcounter_amd perf_counter, cl_perfcounter_info param_name,
    size_t param_value_size, void *param_value, size_t *param_value_size_ret);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clReleasePerfCounterAMD_fn)(
    cl_perfcounter_amd perf_counter);

typedef CL_API_ENTRY cl_int(CL_API_CALL *clRetainPerfCounterAMD_fn)(
    cl_perfcounter_amd perf_counter);

// Function pointer type for the AMD device clock mode entry point.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clSetDeviceClockModeAMD_fn)(
    cl_device_id device,
    cl_set_device_clock_mode_input_amd set_clock_mode_input,
    cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
*param_value_size_ret); + + cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices); + + cl_int clGetDeviceInfo(cl_device_id device, cl_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_context clCreateContext(cl_context_properties *properties, + cl_uint num_devices, const cl_device_id *devices, + void(CL_CALLBACK *pfn_notify)(const char *, + const void *, size_t, + void *), + void *user_data, cl_int *errcode_ret); + + cl_context clCreateContextFromType( + cl_context_properties *properties, cl_device_type device_type, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret); + + cl_int clRetainContext(cl_context context); + + cl_int clReleaseContext(cl_context context); + + cl_int clGetContextInfo(cl_context context, cl_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device, + cl_command_queue_properties properties, + cl_int *errcode_ret); + + cl_int clRetainCommandQueue(cl_command_queue command_queue); + + cl_int clReleaseCommandQueue(cl_command_queue command_queue); + + cl_int clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, + void *host_ptr, cl_int *errcode_ret); + + cl_mem clCreateImage2D(cl_context context, cl_mem_flags flags, + const cl_image_format *image_format, + size_t image_width, size_t image_height, + size_t image_row_pitch, void *host_ptr, + cl_int *errcode_ret); + + cl_mem clCreateImage3D(cl_context context, cl_mem_flags flags, + const cl_image_format *image_format, + size_t image_width, size_t image_height, + size_t 
image_depth, size_t image_row_pitch, + size_t image_slice_pitch, void *host_ptr, + cl_int *errcode_ret); + + cl_int clRetainMemObject(cl_mem memobj); + + cl_int clReleaseMemObject(cl_mem memobj); + + cl_int clGetSupportedImageFormats(cl_context context, cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format *image_formats, + cl_uint *num_image_formats); + + cl_int clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clGetImageInfo(cl_mem image, cl_image_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_sampler clCreateSampler(cl_context context, cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, cl_int *errcode_ret); + + cl_int clRetainSampler(cl_sampler sampler); + + cl_int clReleaseSampler(cl_sampler sampler); + + cl_int clGetSamplerInfo(cl_sampler sampler, cl_sampler_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_program clCreateProgramWithSource(cl_context context, cl_uint count, + const char **strings, + const size_t *lengths, + cl_int *errcode_ret); + + cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char **binaries, + cl_int *binary_status, + cl_int *errcode_ret); + + cl_int clRetainProgram(cl_program program); + + cl_int clReleaseProgram(cl_program program); + + cl_int clBuildProgram(cl_program program, cl_uint num_devices, + const cl_device_id *device_list, const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, + void *user_data), + void *user_data); + + cl_int clCompileProgram( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_headers, + const cl_program *input_headers, const char 
**header_include_names, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data); + + cl_program clLinkProgram(cl_context context, cl_uint num_devices, + const cl_device_id *device_list, const char *options, + cl_uint num_input_programs, + const cl_program *input_programs, + void(CL_CALLBACK *pfn_notify)(cl_program program, + void *user_data), + void *user_data, cl_int *errcode_ret); + + cl_int clUnloadCompiler(void); + + cl_int clUnloadPlatform(cl_platform_id); + + cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_kernel clCreateKernel(cl_program program, const char *kernel_name, + cl_int *errcode_ret); + + cl_int clCreateKernelsInProgram(cl_program program, cl_uint num_kernels, + cl_kernel *kernels, cl_uint *num_kernels_ret); + + cl_int clRetainKernel(cl_kernel kernel); + + cl_int clReleaseKernel(cl_kernel kernel); + + cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, + const void *arg_value); + + cl_int clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list); + + cl_int clGetEventInfo(cl_event evnt, cl_event_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clRetainEvent(cl_event evnt); + + cl_int clReleaseEvent(cl_event evnt); + + cl_int clGetEventProfilingInfo(cl_event evnt, cl_profiling_info param_name, + 
size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clFlush(cl_command_queue command_queue); + + cl_int clFinish(cl_command_queue command_queue); + + cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, + cl_bool blocking_read, size_t offset, size_t cb, + void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, + cl_bool blocking_write, size_t offset, size_t cb, + const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer, + cl_mem dst_buffer, size_t src_offset, + size_t dst_offset, size_t cb, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueReadImage(cl_command_queue command_queue, cl_mem image, + cl_bool blocking_read, const size_t *origin, + const size_t *region, size_t row_pitch, + size_t slice_pitch, void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image, + cl_bool blocking_write, const size_t *origin, + const size_t *region, size_t input_row_pitch, + size_t input_slice_pitch, const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image, + cl_mem dst_image, const size_t *src_origin, + const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueCopyImageToBuffer(cl_command_queue command_queue, + cl_mem src_image, cl_mem dst_buffer, + const size_t *src_origin, + const size_t *region, size_t dst_offset, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event 
*evnt); + + cl_int clEnqueueCopyBufferToImage(cl_command_queue command_queue, + cl_mem src_buffer, cl_mem dst_image, + size_t src_offset, const size_t *dst_origin, + const size_t *region, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt); + + void *clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, + cl_bool blocking_map, cl_map_flags map_flags, + size_t offset, size_t cb, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt, + cl_int *errcode_ret); + + void *clEnqueueMapImage(cl_command_queue command_queue, cl_mem image, + cl_bool blocking_map, cl_map_flags map_flags, + const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt, + cl_int *errcode_ret); + + cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_mem memobj, + void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt); + + cl_int clEnqueueNDRangeKernel( + cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueNativeKernel(cl_command_queue command_queue, + void(CL_CALLBACK *user_func)(void *), void *args, + size_t cb_args, cl_uint num_mem_objects, + const cl_mem *mem_list, + const void **args_mem_loc, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *evnt); + + cl_int clEnqueueMarkerWithWaitList(cl_command_queue command_queue, + cl_uint num_events_in_wait_list, 
+ const cl_event *event_wait_list, + cl_event *evnt); + + cl_int clEnqueueWaitForEvents(cl_command_queue command_queue, + cl_uint num_events, const cl_event *event_list); + + cl_int clEnqueueBarrier(cl_command_queue command_queue); + + void *clGetExtensionFunctionAddress(const char *func_name); + + cl_int clEnqueueReadBufferRect( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt); + + cl_int clEnqueueWriteBufferRect( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt); + + cl_int clEnqueueCopyBufferRect( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_mem clCreateImage(cl_context context, cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, void *host_ptr, + cl_int *errcode_ret); + + cl_mem clCreateSubBuffer(cl_mem mem, cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void *buffer_create_info, cl_int *errcode_ret); + + cl_int clSetEventCallback( + cl_event event, cl_int command_exec_callback_type, + void(CL_CALLBACK *pfn_event_notify)(cl_event event, + cl_int event_command_exec_status, + void *user_data), + void 
*user_data); + + cl_int clEnqueueFillImage(cl_command_queue command_queue, cl_mem image, + void *ptr, const size_t *origin, + const size_t *region, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt); + + cl_int clUnloadPlatformAMD(cl_platform_id id); + + cl_int clEnqueueWaitSignalAMD(cl_command_queue command_queue, + cl_mem mem_object, cl_uint value, + cl_uint num_events, + const cl_event *event_wait_list, + cl_event *event); + + cl_int clEnqueueWriteSignalAMD(cl_command_queue command_queue, + cl_mem mem_object, cl_uint value, + cl_ulong offset, cl_uint num_events, + const cl_event *event_list, cl_event *event); + + cl_int clEnqueueMakeBuffersResidentAMD( + cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects, + cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses, + cl_uint num_events, const cl_event *event_list, cl_event *event); + + cl_int clEnqueueMigrateMemObjects(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem *mem_objects, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event); + + // CL-GL Extension: cl_khr_gl_sharing + cl_int clGetGLContextInfoKHR(const cl_context_properties *properties, + cl_gl_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_mem clCreateFromGLBuffer(cl_context context, cl_mem_flags flags, + unsigned int bufobj, int *errcode_ret); + + cl_mem clCreateFromGLTexture(cl_context context, cl_mem_flags flags, + unsigned int texture_target, int miplevel, + unsigned int texture, cl_int *errcode_ret); + + cl_mem clCreateFromGLTexture2D(cl_context context, cl_mem_flags flags, + unsigned int texture_target, int miplevel, + unsigned int texture, cl_int *errcode_ret); + + cl_mem clCreateFromGLRenderbuffer(cl_context context, cl_mem_flags flags, + unsigned int renderbuffer, + cl_int *errcode_ret); + + cl_int clGetGLObjectInfo(cl_mem 
memobj, cl_gl_object_type *gl_object_type, + unsigned int *gl_object_name); + + cl_int clGetGLTextureInfo(cl_mem memobj, cl_gl_texture_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clEnqueueAcquireGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event); + + cl_int clEnqueueReleaseGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event); + +#if defined(CL_VERSION_2_0) + cl_command_queue clCreateCommandQueueWithProperties( + cl_context context, cl_device_id device, + const cl_queue_properties *properties, cl_int *errcode_ret); + + void *clSVMAlloc(cl_context context, cl_svm_mem_flags flags, size_t size, + cl_uint alignment); + + void clSVMFree(cl_context context, void *svm_pointer); + + cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, + cl_map_flags flags, void *svm_ptr, size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + + cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + + cl_int clEnqueueSVMMemFill(cl_command_queue command_queue, void *svm_ptr, + const void *pattern, size_t pattern_size, + size_t size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + + cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index, + const void *arg_value); + + cl_mem clCreatePipe(cl_context context, cl_mem_flags flags, + cl_uint packet_size, cl_uint num_packets, + const cl_pipe_properties *properties, + cl_int *errcode_ret); + + cl_int clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name, + size_t param_value_size, void *param_value, + size_t 
*param_value_size_ret); + +#endif + + cl_perfcounter_amd clCreatePerfCounterAMD(cl_device_id device, + cl_perfcounter_property *properties, + cl_int *errcode_ret); + + cl_int clEnqueueBeginPerfCounterAMD(cl_command_queue command_queue, + cl_uint num_perf_counters, + cl_perfcounter_amd *perf_counters, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event); + + cl_int clEnqueueEndPerfCounterAMD(cl_command_queue command_queue, + cl_uint num_perf_counters, + cl_perfcounter_amd *perf_counters, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event); + + cl_int clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter, + cl_perfcounter_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); + + cl_int clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter); + + cl_int clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter); + + cl_int clSetDeviceClockModeAMD( + cl_device_id device, + cl_set_device_clock_mode_input_amd set_clock_mode_input, + cl_set_device_clock_mode_output_amd *set_clock_mode_Output); + + private: + clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD_ptr; + clEnqueueWriteSignalAMD_fn clEnqueueWriteSignalAMD_ptr; + clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD_ptr; + + // Unload the platform + clUnloadPlatformAMD_fn clUnloadPlatformAMD_ptr; + + // CL-GL Extension: cl_khr_gl_sharing + clGetGLContextInfoKHR_fn clGetGLContextInfoKHR_ptr; + clCreateFromGLBuffer_fn clCreateFromGLBuffer_ptr; + clCreateFromGLTexture_fn clCreateFromGLTexture_ptr; + clCreateFromGLTexture2D_fn clCreateFromGLTexture2D_ptr; + clCreateFromGLRenderbuffer_fn clCreateFromGLRenderbuffer_ptr; + clGetGLObjectInfo_fn clGetGLObjectInfo_ptr; + clGetGLTextureInfo_fn clGetGLTextureInfo_ptr; + clEnqueueAcquireGLObjects_fn clEnqueueAcquireGLObjects_ptr; + clEnqueueReleaseGLObjects_fn clEnqueueReleaseGLObjects_ptr; + + // Performance counters + clCreatePerfCounterAMD_fn 
clCreatePerfCounterAMD_ptr; + clEnqueueBeginPerfCounterAMD_fn clEnqueueBeginPerfCounterAMD_ptr; + clEnqueueEndPerfCounterAMD_fn clEnqueueEndPerfCounterAMD_ptr; + clGetPerfCounterInfoAMD_fn clGetPerfCounterInfoAMD_ptr; + clReleasePerfCounterAMD_fn clReleasePerfCounterAMD_ptr; + clRetainPerfCounterAMD_fn clRetainPerfCounterAMD_ptr; + // Set clockMode + clSetDeviceClockModeAMD_fn clSetDeviceClockModeAMD_ptr; +}; + +#endif diff --git a/projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp b/projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp new file mode 100644 index 0000000000..519833fd98 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/log/oclTestLog.cpp @@ -0,0 +1,104 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "oclTestLog.h" + +#include +#include + +#include "OCLLog.h" + +oclLog::oclLog() + : m_stdout_fp(stdout), m_filename(""), m_writeToFileIsEnabled(false) {} + +oclLog::~oclLog() { disable_write_to_file(); } + +void oclLog::enable_write_to_file(std::string filename) { + m_writeToFileIsEnabled = true; + m_filename = filename; + FILE* fp = fopen(m_filename.c_str(), "w"); + if (fp == NULL) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "ERROR: Cannot open file %s. Disabling logging to file.\n", + filename.c_str()); + m_writeToFileIsEnabled = false; + } else { + fclose(fp); + } +} + +void oclLog::disable_write_to_file() { m_writeToFileIsEnabled = false; } + +void oclLog::vprint(char const* fmt, va_list args) { + // hack for fixing the lnx64bit segfault and + // garbage printing in file. XXX 2048 a magic number + char buffer[4096]; + + memset(buffer, 0, sizeof(buffer)); + int rc = vsnprintf(buffer, sizeof(buffer), fmt, args); + assert(rc >= 0 && rc != sizeof(buffer)); + + fputs(buffer, m_stdout_fp); + if (m_writeToFileIsEnabled) { + FILE* fp = fopen(m_filename.c_str(), "a"); + if (fp == NULL) { + oclTestLog(OCLTEST_LOG_ALWAYS, + "ERROR: Cannot open file %s. Disabling logging to file.\n", + m_filename.c_str()); + m_writeToFileIsEnabled = false; + } + fputs(buffer, fp); + fclose(fp); + } +} + +void oclLog::flush() { fflush(m_stdout_fp); } + +static oclLog& theLog() { + static oclLog Log; + return Log; +} + +static oclLoggingLevel currentLevel = OCLTEST_LOG_ALWAYS; +static float logcount = 0.0f; + +void oclTestLog(oclLoggingLevel logLevel, const char* fmt, ...) 
{ + logcount += 1.0f; + + if (logLevel <= currentLevel) { + va_list args; + va_start(args, fmt); + + theLog().vprint(fmt, args); + theLog().flush(); + + va_end(args); + } +} + +void oclTestEnableLogToFile(const char* filename) { + theLog().enable_write_to_file(filename); +} + +void oclTestSetLogLevel(int level) { + if (level >= 0) { + currentLevel = static_cast(level); + } +} diff --git a/projects/clr/opencl/tests/ocltst/log/oclTestLog.h b/projects/clr/opencl/tests/ocltst/log/oclTestLog.h new file mode 100644 index 0000000000..28953941ce --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/log/oclTestLog.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef CALTESTLOG_H_ +#define CALTESTLOG_H_ + +#include +#include + +#include + +class oclLog { + public: + oclLog(); + virtual ~oclLog(); + virtual void vprint(char const* fmt, va_list args); + virtual void flush(); + virtual void enable_write_to_file(std::string filename); + virtual void disable_write_to_file(); + + private: + FILE* m_stdout_fp; + std::string m_filename; + bool m_writeToFileIsEnabled; +}; + +#endif // CALTESTLOG_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp b/projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp new file mode 100644 index 0000000000..5aa6ce2b34 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/BaseTestImp.cpp @@ -0,0 +1,185 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "BaseTestImp.h" + +#include +#include +#include + +#include + +///////////////////////////////////////////////////////////////////////////// + +static unsigned int crcinit(unsigned int crc); +static int initializeSeed(void); + +///////////////////////////////////////////////////////////////////////////// + +BaseTestImp::BaseTestImp() + : _numSubTests(0), _openTest(0), _deviceName(NULL), _architecture(0) { + _cpu = false; + unsigned int i; + for (i = 0; i < 256; i++) { + _crctab[i] = crcinit(i << 24); + } + _crcword = ~0; + _deviceId = 0; + _platformIndex = 0; + _perfInfo = 0.0f; + +#ifdef ATI_OS_LINUX // + _useThreads = 0; // disable threads on linux +#else + _useThreads = 1; // if available on platform +#endif + + clearError(); +} + +void BaseTestImp::checkComplib(unsigned int test, const char *deviceName, + unsigned int architecture) { + BaseTestImp::open(); + devices_ = 0; + deviceCount_ = 0; + context_ = 0; + program_ = 0; + kernel_ = 0; + type_ = CL_DEVICE_TYPE_GPU; + + cl_uint numPlatforms = 0; + error_ = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed"); + CHECK_RESULT((numPlatforms == 0), "No platform found"); + + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + cl_platform_id platform = 0; +#if 0 + for(unsigned int i = 0; i < numPlatforms; ++i) + { + char buff[200]; + error_ = clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(buff), buff, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + if(strcmp(buff, "Advanced Micro Devices, Inc.") == 0) + { + platform = platforms[i]; + break; + } + } +#endif + platform = platforms[_platformIndex]; + + delete[] platforms; + + CHECK_RESULT((platform == 0), "AMD Platform not found"); + + error_ = clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_); + CHECK_RESULT(error_ != 
CL_SUCCESS, "clGetDeviceIDs() failed"); + + devices_ = new cl_device_id[deviceCount_]; + error_ = clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed"); + + char device_string[200]; + clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, sizeof(device_string), + &device_string, NULL); + if (strstr(device_string, "LC")) { + printf("Skipping test since it does not run with LC\n"); + failed_ = true; + return; + } + return; +} + +BaseTestImp::~BaseTestImp() {} + +void BaseTestImp::open() { + _crcword = 0; + clearError(); +} +void BaseTestImp::open(unsigned int test, const char *deviceName, + unsigned int architecture) { + open(); +} + +unsigned int BaseTestImp::close() { return _crcword; } + +unsigned int BaseTestImp::getThreadUsage(void) { return _useThreads; } + +int BaseTestImp::getNumSubTests(void) { return _numSubTests; } + +void BaseTestImp::setDeviceName(const char *name) { _deviceName = name; } + +const char *BaseTestImp::getDeviceName() { return _deviceName; } + +float BaseTestImp::getPerfInfo(void) { return _perfInfo; } + +void BaseTestImp::clearPerfInfo(void) { _perfInfo = 0.0; } + +void BaseTestImp::setDeviceId(unsigned int deviceId) { _deviceId = deviceId; } + +void BaseTestImp::setIterationCount(int cnt) { _iterationCnt = cnt; } + +unsigned int BaseTestImp::getDeviceId() { return _deviceId; } + +void BaseTestImp::setPlatformIndex(unsigned int platformIndex) { + _platformIndex = platformIndex; +} + +unsigned int BaseTestImp::getPlatformIndex() { return _platformIndex; } + +void BaseTestImp::setErrorMsg(const char *error) { + _errorFlag = true; + _errorMsg.assign((const char *)error); +} + +const char *BaseTestImp::getErrorMsg() { return _errorMsg.c_str(); } + +bool BaseTestImp::hasErrorOccured() { return _errorFlag; } + +void BaseTestImp::clearError() { + _errorFlag = false; + _errorMsg.clear(); +} + +///////////////////////////////////////////////////////////////////////////// + 
+///////////////////////////////////////////////////////////////////////////// +// +// Same CRC32 as used by ogtst +// +static const unsigned int CRCMASK = 0x04c11db7; + +static unsigned int crcinit(unsigned int crc) { + int i; + unsigned int ans = crc; + + for (i = 0; i < 8; i++) { + if (ans & 0x80000000) { + ans = (ans << 1) ^ CRCMASK; + } else { + ans <<= 1; + } + } + return (ans); +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp new file mode 100644 index 0000000000..4cf7aa3289 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.cpp @@ -0,0 +1,175 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLGLCommon.h" + +#include +#include + +void OCLGLCommon::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + // OpenCL Initialization + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_); + + char name[1024] = {0}; + size_t size = 0; + + if (deviceId >= deviceCount_) { + _errorFlag = true; + return; + } + + // Check that the device supports CL/GL interop extension + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024, + name, &size); + if (!strstr(name, "cl_khr_gl_sharing")) { + printf("KHR GL sharing extension is required for this test!\n"); + _errorFlag = true; + return; + } + + // OpenGL Initialization + bool retVal = initializeGLContext(hGL_); + CHECK_RESULT((retVal == CL_SUCCESS), "Error opening test (%d)", error_); + + createCLContextFromGLContext(hGL_); +} + +bool OCLGLCommon::IsGLEnabled(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + bool bResult = initializeGLContext(hGL_); + if (bResult) { + deleteGLContext(hGL_); + } + OCLTestImp::close(); + return bResult; +} + +void OCLGLCommon::gluPerspective(double fovy, double aspect, double zNear, + double zFar) { + double xmin, xmax, ymin, ymax; + ymax = zNear * tan(fovy * 3.149 / 360.0); + ymin = -ymax; + xmin = ymin * aspect; + xmax = ymax * aspect; + glFrustum(xmin, xmax, ymin, ymax, zNear, zFar); +} + +unsigned int OCLGLCommon::close(void) { + makeCurrent(hGL_); + unsigned int retVal = OCLTestImp::close(); + deleteGLContext(hGL_); + return retVal; +} + +void OCLGLCommon::dumpBuffer(float *pBuffer, const char fileName[], + unsigned int dimSize) { + if (pBuffer) { + FILE *f = fopen(fileName, "w"); + if (NULL != f) { + unsigned int i, j; + for (i = 0; i < dimSize; i++) { + for (j = 0; j < dimSize; j++) { + fprintf(f, "%e,\t", pBuffer[i * (dimSize) + j]); + } + fprintf(f, "\n"); + 
} + fclose(f); + } + } +} + +bool OCLGLCommon::createGLFragmentProgramFromSource(const char *source, + GLuint &shader, + GLuint &program) { + shader = glCreateShader(GL_FRAGMENT_SHADER); + glShaderSource(shader, 1, &source, NULL); + glCompileShader(shader); + printShaderInfoLog(shader); + program = glCreateProgram(); + glAttachShader(program, shader); + glLinkProgram(program); + printProgramInfoLog(program); + + return program != 0; +} + +int OCLGLCommon::printOglError(char *file, int line) { + // + // Returns 1 if an OpenGL error occurred, 0 otherwise. + // + GLenum glErr; + int retCode = 0; + + glErr = glGetError(); + if (glErr != GL_NO_ERROR) { + printf("glError in file %s @ line %d: %d\n", file, line, glErr); + retCode = 1; + } + return retCode; +} + +// +// Print out the information log for a shader object +// +void OCLGLCommon::printShaderInfoLog(GLuint shader) { + int infologLength = 0; + int charsWritten = 0; + GLchar *infoLog; + + glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infologLength); + + if (infologLength > 0) { + infoLog = (GLchar *)malloc(infologLength); + if (infoLog == NULL) { + printf("ERROR: Could not allocate InfoLog buffer\n"); + return; + } + glGetShaderInfoLog(shader, infologLength, &charsWritten, infoLog); + printf("Shader InfoLog:\n%s\n\n", infoLog); + free(infoLog); + } +} + +void OCLGLCommon::printProgramInfoLog(GLuint program) { + int infologLength = 0; + int charsWritten = 0; + GLchar *infoLog; + + // printOpenGLError(); // Check for OpenGL errors + + glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infologLength); + + // printOpenGLError(); // Check for OpenGL errors + + if (infologLength > 0) { + infoLog = (GLchar *)malloc(infologLength); + if (infoLog == NULL) { + printf("ERROR: Could not allocate InfoLog buffer\n"); + exit(1); + } + glGetProgramInfoLog(program, infologLength, &charsWritten, infoLog); + printf("Program InfoLog:\n%s\n\n", infoLog); + free(infoLog); + } + // printOpenGLError(); // Check for OpenGL errors +} diff --git 
a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h new file mode 100644 index 0000000000..003267952d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommon.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_GL_COMMON_H_ +#define _OCL_GL_COMMON_H_ + +#include +#include +#include + +#include +#include + +#include "OCLTestImp.h" + +typedef struct OCLGLHandle_* OCLGLHandle; + +#define printOpenGLError() OCLGLCommon::printOglError(__FILE__, __LINE__) + +class OCLGLCommon : public OCLTestImp { + public: + ///////////////////////////////////////// + // private initialization and clean-up // + ///////////////////////////////////////// + OCLGLCommon(); + virtual ~OCLGLCommon(); + /////////////////////// + // virtual interface // + /////////////////////// + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual unsigned int close(void); + static void gluPerspective(double fovy, double aspect, double zNear, + double zFar); + static void dumpBuffer(float* pBuffer, const char fileName[], + unsigned int dimSize); + static int printOglError(char* file, int line); + static bool createGLFragmentProgramFromSource(const char* source, + GLuint& shader, + GLuint& program); + static void printShaderInfoLog(GLuint shader); + static void printProgramInfoLog(GLuint program); + + protected: + const OCLGLHandle getGLHandle() { return hGL_; } + void makeCurrent(const OCLGLHandle hGL); + void getCLContextPropertiesFromGLContext(const OCLGLHandle hGL, + cl_context_properties properties[7]); + bool createGLContext(OCLGLHandle& hGL); + void destroyGLContext(OCLGLHandle& hGL); + bool IsGLEnabled(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + + private: + bool initializeGLContext(OCLGLHandle& hGL); + void deleteGLContext(OCLGLHandle& hGL); + bool checkAssociationDeviceWithGLContext(OCLGLHandle& hGL); + void createCLContextFromGLContext(OCLGLHandle& hGL); + + OCLGLHandle hGL_; +}; + +#endif // _OCL_GL_COMMON_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp new file mode 100644 index 
0000000000..4d445d1442 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonLinux.cpp @@ -0,0 +1,239 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLGLCommon.h" + +struct OCLGLHandle_ { + static Display* display; + static XVisualInfo* vInfo; + static int referenceCount; + GLXContext context; + Window window; + Colormap cmap; +}; + +Display* OCLGLHandle_::display = NULL; +XVisualInfo* OCLGLHandle_::vInfo = NULL; +int OCLGLHandle_::referenceCount = 0; + +OCLGLCommon::OCLGLCommon() { + hGL_ = new OCLGLHandle_; + + hGL_->context = NULL; + hGL_->window = 0; + hGL_->cmap = 0; +} + +OCLGLCommon::~OCLGLCommon() { destroyGLContext(hGL_); } + +void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) { + deleteGLContext(hGL); + delete hGL; + hGL = NULL; +} + +void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) { + if (hGL->display != NULL) { + glXMakeCurrent(hGL->display, None, NULL); + if (hGL->cmap) { + XFreeColormap(hGL->display, hGL->cmap); + hGL->cmap = 0; + } + if (hGL->window) { + XDestroyWindow(hGL->display, hGL->window); + hGL->window = 0; + } + if (hGL->context) { + glXDestroyContext(hGL->display, hGL->context); + hGL->context = NULL; + } + + hGL->referenceCount--; + if (hGL->referenceCount == 0) { + XCloseDisplay(hGL->display); + hGL->display = NULL; + + XFree(hGL->vInfo); + hGL->vInfo = NULL; + } + } +} + +bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) { + hGL = new OCLGLHandle_; + return initializeGLContext(hGL); +} + +bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) { + if (hGL->display == NULL) { + hGL->display = XOpenDisplay(NULL); + if (hGL->display == NULL) { + printf("XOpenDisplay() failed\n"); + return false; + } + } + if (hGL->vInfo == NULL) { + int dblBuf[] = {GLX_RGBA, GLX_RED_SIZE, 1, GLX_GREEN_SIZE, + 1, GLX_BLUE_SIZE, 1, GLX_DEPTH_SIZE, + 12, GLX_DOUBLEBUFFER, None}; + + hGL->vInfo = + glXChooseVisual(hGL->display, DefaultScreen(hGL->display), dblBuf); + if (hGL->vInfo == NULL) { + printf("glXChooseVisual() failed\n"); + return false; + } + } + hGL->referenceCount++; + + hGL->context = glXCreateContext(hGL->display, hGL->vInfo, None, True); + if (hGL->context == NULL) { 
+ printf("glXCreateContext() failed\n"); + return false; + } + + XSetWindowAttributes swa = {0}; + hGL->cmap = XCreateColormap(hGL->display, + RootWindow(hGL->display, hGL->vInfo->screen), + hGL->vInfo->visual, AllocNone); + swa.colormap = hGL->cmap; + hGL->window = XCreateWindow( + hGL->display, RootWindow(hGL->display, hGL->vInfo->screen), 0, 0, 640, + 480, 0, hGL->vInfo->depth, InputOutput, hGL->vInfo->visual, + CWBorderPixel | CWColormap | CWEventMask, &swa); + + Bool glErr = glXMakeCurrent(hGL->display, hGL->window, hGL->context); + if (False == glErr) { + return false; + } + + if (!checkAssociationDeviceWithGLContext(hGL)) { + deleteGLContext(hGL); + return false; + } + return true; +} + +bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) { + bool ret = false; + size_t devicesSize = 0; + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform_, + CL_GL_CONTEXT_KHR, + (cl_context_properties)hGL->context, + CL_GLX_DISPLAY_KHR, + (cl_context_properties)hGL->display, + 0}; + + error_ = _wrapper->clGetGLContextInfoKHR( + properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &devicesSize); + if (error_ != CL_SUCCESS) { + printf("clGetGLContextInfoKHR failed (%d)\n", error_); + return false; + } + + cl_uint numDevices = (cl_uint)devicesSize / sizeof(cl_device_id); + cl_device_id* interopDevices = (cl_device_id*)malloc(devicesSize); + + error_ = + _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, + devicesSize, interopDevices, NULL); + if (error_ != CL_SUCCESS) { + printf("clGetGLContextInfoKHR failed (%d)\n", error_); + free(interopDevices); + return false; + } + + // Check that current device can be associated with OpenGL context + for (unsigned int i = 0; i < numDevices; i++) { + if (interopDevices[i] == devices_[_deviceId]) { + ret = true; + break; + } + } + + free(interopDevices); + return ret; +} + +void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) { + 
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform_, + CL_GL_CONTEXT_KHR, + (cl_context_properties)hGL->context, + CL_GLX_DISPLAY_KHR, + (cl_context_properties)hGL->display, + 0}; + + // Release current command queue + if (cmdQueues_[_deviceId]) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + + // Release current context + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed"); + } + + // Create new CL context from GL context + context_ = + clCreateContext(properties, 1, &devices_[_deviceId], NULL, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_); + + // Create command queue for new context + cmdQueues_[_deviceId] = + _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)", + error_); + + // GLEW versions 1.13.0 and earlier do not fetch all GL function pointers + // without glewExperimental set. 
+ glewExperimental = GL_TRUE; + GLenum glErr = glewInit(); + CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed: %s", + glewGetErrorString(glErr)); +} + +void OCLGLCommon::makeCurrent(OCLGLHandle hGL) { + if (hGL == NULL) { + if (hGL_ != NULL) { + glXMakeCurrent(hGL_->display, None, NULL); + } + } else { + bool ret = glXMakeCurrent(hGL->display, hGL->window, hGL->context); + assert(ret && "glXMakeCurrent failed!"); + } +} + +void OCLGLCommon::getCLContextPropertiesFromGLContext( + const OCLGLHandle hGL, cl_context_properties properties[7]) { + if (!properties) return; + + properties[0] = CL_CONTEXT_PLATFORM; + properties[1] = (cl_context_properties)platform_; + properties[2] = CL_GL_CONTEXT_KHR; + properties[3] = (cl_context_properties)hGL->context; + properties[4] = CL_GLX_DISPLAY_KHR; + properties[5] = (cl_context_properties)hGL->display; + properties[6] = 0; +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp new file mode 100644 index 0000000000..4a08bd6268 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLGLCommonWindows.cpp @@ -0,0 +1,239 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLGLCommon.h" + +struct OCLGLHandle_ { + HDC hdc; + HGLRC hglrc; +}; + +OCLGLCommon::OCLGLCommon() { + hGL_ = new OCLGLHandle_; + + hGL_->hdc = NULL; + hGL_->hglrc = NULL; +} + +OCLGLCommon::~OCLGLCommon() { destroyGLContext(hGL_); } + +void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) { + deleteGLContext(hGL); + delete hGL; + hGL = NULL; +} + +void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) { + wglMakeCurrent(NULL, NULL); + if (hGL->hglrc) { + wglDeleteContext(hGL->hglrc); + hGL->hglrc = NULL; + } + if (hGL->hdc) { + DeleteDC(hGL->hdc); + hGL->hdc = NULL; + } +} + +bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) { + hGL = new OCLGLHandle_; + return initializeGLContext(hGL); +} + +bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) { + BOOL glErr = FALSE; + DISPLAY_DEVICE dispDevice; + DWORD deviceNum; + int pfmt; + PIXELFORMATDESCRIPTOR pfd; + pfd.nSize = sizeof(PIXELFORMATDESCRIPTOR); + pfd.nVersion = 1; + pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER; + pfd.iPixelType = PFD_TYPE_RGBA; + pfd.cColorBits = 24; + pfd.cRedBits = 8; + pfd.cRedShift = 0; + pfd.cGreenBits = 8; + pfd.cGreenShift = 0; + pfd.cBlueBits = 8; + pfd.cBlueShift = 0; + pfd.cAlphaBits = 8; + pfd.cAlphaShift = 0; + pfd.cAccumBits = 0; + pfd.cAccumRedBits = 0; + pfd.cAccumGreenBits = 0; + pfd.cAccumBlueBits = 0; + pfd.cAccumAlphaBits = 0; + pfd.cDepthBits = 24; + pfd.cStencilBits = 8; + pfd.cAuxBuffers = 0; + pfd.iLayerType = PFD_MAIN_PLANE; + pfd.bReserved = 0; + 
pfd.dwLayerMask = 0; + pfd.dwVisibleMask = 0; + pfd.dwDamageMask = 0; + + dispDevice.cb = sizeof(DISPLAY_DEVICE); + for (deviceNum = 0; EnumDisplayDevices(NULL, deviceNum, &dispDevice, 0); + deviceNum++) { + if (dispDevice.StateFlags & DISPLAY_DEVICE_MIRRORING_DRIVER) { + continue; + } + + hGL->hdc = CreateDC(NULL, dispDevice.DeviceName, NULL, NULL); + if (!hGL->hdc) { + continue; + } + + pfmt = ChoosePixelFormat(hGL->hdc, &pfd); + if (pfmt == 0) { + printf("Failed choosing the requested PixelFormat.\n"); + return false; + } + + glErr = SetPixelFormat(hGL->hdc, pfmt, &pfd); + if (glErr == FALSE) { + printf("Failed to set the requested PixelFormat.\n"); + return false; + } + + hGL->hglrc = wglCreateContext(hGL->hdc); + if (NULL == hGL->hglrc) { + printf("wglCreateContext() failed\n"); + return false; + } + + glErr = wglMakeCurrent(hGL->hdc, hGL->hglrc); + if (FALSE == glErr) { + printf("wglMakeCurrent() failed\n"); + return false; + } + + if (!checkAssociationDeviceWithGLContext(hGL)) { + deleteGLContext(hGL); + return false; + } + + return true; + } // for (deviceNum = 0; EnumDisplayDevices(NULL, deviceNum, &dispDevice, + // 0); deviceNum++) { + + return false; +} + +bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) { + bool ret = false; + size_t devicesSize = 0; + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform_, + CL_GL_CONTEXT_KHR, + (cl_context_properties)hGL->hglrc, + CL_WGL_HDC_KHR, + (cl_context_properties)hGL->hdc, + 0}; + + error_ = _wrapper->clGetGLContextInfoKHR( + properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &devicesSize); + if (error_ != CL_SUCCESS) { + printf("clGetGLContextInfoKHR failed (%d)\n", error_); + return false; + } + + cl_uint numDevices = (cl_uint)devicesSize / sizeof(cl_device_id); + cl_device_id* interopDevices = (cl_device_id*)malloc(devicesSize); + + error_ = + _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, + devicesSize, 
interopDevices, NULL); + if (error_ != CL_SUCCESS) { + printf("clGetGLContextInfoKHR failed (%d)\n", error_); + free(interopDevices); + return false; + } + + // Check that current device can be associated with OpenGL context + for (unsigned int i = 0; i < numDevices; i++) { + if (interopDevices[i] == devices_[_deviceId]) { + ret = true; + break; + } + } + + free(interopDevices); + return ret; +} + +void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) { + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform_, + CL_GL_CONTEXT_KHR, + (cl_context_properties)hGL->hglrc, + CL_WGL_HDC_KHR, + (cl_context_properties)hGL->hdc, + 0}; + + // Release current command queue + if (cmdQueues_[_deviceId]) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + + // Release current context + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed"); + } + + // Create new CL context from GL context + context_ = + clCreateContext(properties, 1, &devices_[_deviceId], NULL, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_); + + // Create command queue for new context + cmdQueues_[_deviceId] = + _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)", + error_); + + GLenum glErr = glewInit(); + CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed"); +} + +void OCLGLCommon::makeCurrent(OCLGLHandle hGL) { + if (hGL == NULL) { + wglMakeCurrent(NULL, NULL); + } else { + wglMakeCurrent(hGL->hdc, hGL->hglrc); + } +} + +void OCLGLCommon::getCLContextPropertiesFromGLContext( + const OCLGLHandle hGL, cl_context_properties properties[7]) { + if (!properties) return; + + properties[0] = CL_CONTEXT_PLATFORM; + properties[1] = 
(cl_context_properties)platform_; + properties[2] = CL_GL_CONTEXT_KHR; + properties[3] = (cl_context_properties)hGL->hglrc; + properties[4] = CL_WGL_HDC_KHR; + properties[5] = (cl_context_properties)hGL->hdc; + properties[6] = 0; +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp new file mode 100644 index 0000000000..70d33ecb10 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLTestImp.cpp @@ -0,0 +1,288 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLTestImp.h" + +#include +#include +#include + +#include +#include + +///////////////////////////////////////////////////////////////////////////// + +static unsigned int crcinit(unsigned int crc); +static int initializeSeed(void); + +///////////////////////////////////////////////////////////////////////////// + +OCLutil::Lock OCLTestImp::openDeviceLock; +OCLutil::Lock OCLTestImp::compileLock; + +OCLTestImp::OCLTestImp() + : _wrapper(0), + _seed(0), + error_(0), + type_(0), + deviceCount_(0), + devices_(0), + platform_(0), + context_(0), + program_(0), + kernel_(0) { + unsigned int i; + for (i = 0; i < 256; i++) { + _crctab[i] = crcinit(i << 24); + } + _perfInfo = 0; + + _wrapper = 0; + _iterationCnt = 0; + + _seed = initializeSeed(); + + _errorMsg = ""; + _errorFlag = false; + type_ = CL_DEVICE_TYPE_GPU; +} + +OCLTestImp::~OCLTestImp() {} +void OCLTestImp::useCPU() { type_ = CL_DEVICE_TYPE_CPU; } +void OCLTestImp::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + devices_ = 0; + context_ = 0; + program_ = 0; + kernel_ = 0; + deviceCount_ = 0; + + open(test, units, conversion, deviceId, getPlatformIndex()); +} +void OCLTestImp::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId, unsigned int platformIndex) { + BaseTestImp::open(); + devices_ = 0; + deviceCount_ = 0; + context_ = 0; + program_ = 0; + kernel_ = 0; + _deviceId = deviceId; + _platformIndex = platformIndex; + + cl_uint numPlatforms = 0; + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed"); + CHECK_RESULT((numPlatforms == 0), "No platform found"); + + cl_platform_id* platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + cl_platform_id platform = 0; +#if 0 + for(unsigned int i = 0; i < numPlatforms; ++i) + { + char 
buff[200]; + error_ = _wrapper->clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(buff), buff, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + if(strcmp(buff, "Advanced Micro Devices, Inc.") == 0) + { + platform = platforms[i]; + break; + } + } +#endif + platform = platforms[_platformIndex]; + + delete[] platforms; + + CHECK_RESULT((platform == 0), "AMD Platform not found"); + + error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed"); + + devices_ = new cl_device_id[deviceCount_]; + error_ = + _wrapper->clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed"); + + cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform, 0}; + context_ = _wrapper->clCreateContext(props, deviceCount_, devices_, NULL, 0, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext failed"); + + cl_command_queue cmdQueue; + for (unsigned int i = 0; i < deviceCount_; ++i) { +#ifndef CL_VERSION_2_0 + cmdQueue = _wrapper->clCreateCommandQueue( + context_, devices_[i], CL_QUEUE_PROFILING_ENABLE, &error_); +#else + cl_queue_properties prop[] = {CL_QUEUE_PROPERTIES, + CL_QUEUE_PROFILING_ENABLE, 0}; + cmdQueue = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[i], prop, &error_); +#endif + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed"); + cmdQueues_.push_back(cmdQueue); + } + platform_ = platform; +} + +unsigned int OCLTestImp::close() { + for (unsigned int i = 0; i < buffers().size(); ++i) { + error_ = _wrapper->clReleaseMemObject(buffers()[i]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseMemObject() failed"); + } + buffers_.clear(); + + if (kernel_ != 0) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed"); + } + + if (program_ != 0) { + 
error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseProgram() failed"); + } + + for (unsigned int i = 0; i < cmdQueues_.size(); ++i) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[i]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + cmdQueues_.clear(); + + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed"); + } + + if (devices_) { + delete[] devices_; + } + + return BaseTestImp::close(); +} + +int OCLTestImp::genBitRand(int n) { + int rslt; + if (n <= 0 || n > 32) { + assert(0); + rslt = 0; + } else if (n < 32) { + _seed = _seed * 1103515245 + 12345; + /* + * return the most-significant n bits; they are the random ones (see + * Knuth, Vol 2) + */ + rslt = (_seed & 0x7fffffff) >> (31 - n); + } else { + rslt = (genBitRand(16) << 16) | genBitRand(16); + } + + return rslt; +} + +int OCLTestImp::genIntRand(int a, int b) { + int r; + int sign = 1; + int mySmall; + int delta; + int bits = 0; + int rslt; + if (a > b) { + mySmall = b; + delta = a - b; + } else { + mySmall = a; + delta = b - a; + } + if (delta == 0) { + rslt = a; + return (rslt); + } else if (delta < 0) { + sign = -1; + delta = -delta; + } + delta &= 0x7fffffff; + for (r = delta; r > 0; r >>= 1) { + bits++; + } + do { + r = genBitRand(bits); + } while (r > delta); + + rslt = mySmall + r * sign; + + return (rslt); +} + +void OCLTestImp::setOCLWrapper(OCLWrapper* wrapper) { _wrapper = wrapper; } + +///////////////////////////////////////////////////////////////////////////// + +#ifdef ATI_OS_WIN + +#include + +static int initializeSeed(void) { + __int64 val; + QueryPerformanceCounter((LARGE_INTEGER*)&val); + return (int)val; +} + +#endif // ATI_OS_WIN + +///////////////////////////////////////////////////////////////////////////// + +#ifdef ATI_OS_LINUX + +#include + +static int initializeSeed(void) { + struct 
timeval t; + gettimeofday(&t, 0); + return (int)t.tv_usec; +} + +#endif // ATI_OS_LINUX + +///////////////////////////////////////////////////////////////////////////// +// +// Same CRC32 as used by ogtst +// +static const unsigned int CRCMASK = 0x04c11db7; + +static unsigned int crcinit(unsigned int crc) { + int i; + unsigned int ans = crc; + + for (i = 0; i < 8; i++) { + if (ans & 0x80000000) { + ans = (ans << 1) ^ CRCMASK; + } else { + ans <<= 1; + } + } + return (ans); +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp new file mode 100644 index 0000000000..4398652904 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLTestListImp.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLTestListImp.h" + +#include + +#include "OCLTest.h" + +// +// OCLTestList_TestCount - retrieve the number of tests in the testing module +// +unsigned int OCL_CALLCONV OCLTestList_TestCount(void) { return TestListCount; } + +// +// OCLTestList_TestLibVersion - retrieve the version of test lib in the testing +// module +// +unsigned int OCL_CALLCONV OCLTestList_TestLibVersion(void) { + return TestLibVersion; +} + +// +// OCLTestList_TestLibName - retrieve the name of test library +// +const char* OCL_CALLCONV OCLTestList_TestLibName(void) { return TestLibName; } + +// +// OCLTestList_TestName - retrieve the name of the indexed test in the module +// +const char* OCL_CALLCONV OCLTestList_TestName(unsigned int testNum) { + if (testNum >= OCLTestList_TestCount()) { + return NULL; + } + + return TestList[testNum].name; +} + +// +// OCLTestList_CreateTest - create a test by index +// +OCLTest* OCL_CALLCONV OCLTestList_CreateTest(unsigned int testNum) { + if (testNum >= OCLTestList_TestCount()) { + return NULL; + } + + return reinterpret_cast((*TestList[testNum].create)()); +} + +// +// OCLTestList_DestroyTest - destroy a test object +// +void OCL_CALLCONV OCLTestList_DestroyTest(OCLTest* test) { delete test; } diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp new file mode 100644 index 0000000000..e5b341956c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLTestUtils.cpp @@ -0,0 +1,46 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLTestUtils.h" + +#include +#include + +bool loadFile(const char* filename, std::string& s) { + size_t size; + char* str; + std::fstream f(filename, std::fstream::in | std::fstream::binary); + + if (f.is_open()) { + size_t fileSize; + f.seekg(0, std::fstream::end); + size = fileSize = (size_t)f.tellg(); + f.seekg(0, std::fstream::beg); + str = new char[size + 1]; + f.read(str, fileSize); + f.close(); + str[size] = '\0'; + s = str; + delete[] str; + return true; + } + std::cerr << "Error: failed to open file: " << filename << '\n'; + return false; +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp new file mode 100644 index 0000000000..051f565b9d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLThread.cpp @@ -0,0 +1,209 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +//! +//! \file OCLThread.cpp +//! + +#include +#include + +#include "OCL/Thread.h" +#ifdef ATI_OS_WIN +#include +#endif + +//! pack the function pointer and data inside this struct +typedef struct __argsToThreadFunc { + oclThreadFunc func; + void *data; + +} argsToThreadFunc; + +#ifdef ATI_OS_WIN +//! Windows thread callback - invokes the callback set by +//! the application in OCLThread constructor +unsigned _stdcall win32ThreadFunc(void *args) { + argsToThreadFunc *ptr = (argsToThreadFunc *)args; + OCLutil::Thread *obj = (OCLutil::Thread *)ptr->data; + ptr->func(obj->getData()); + delete args; + return 0; +} +#endif + +//////////////////////////////////////////////////////////////////// +//! +//! Constructor for OCLLock +//! 
+OCLutil::Lock::Lock() { +#ifdef ATI_OS_WIN + InitializeCriticalSection(&_cs); +#else + pthread_mutex_init(&_lock, NULL); +#endif +} + +//////////////////////////////////////////////////////////////////// +//! +//! Destructor for OCLLock +//! +OCLutil::Lock::~Lock() { +#ifdef ATI_OS_WIN + DeleteCriticalSection(&_cs); +#else + pthread_mutex_destroy(&_lock); +#endif +} + +////////////////////////////////////////////////////////////// +//! +//! Try to acquire the lock, wait for the lock if unavailable +//! else hold the lock and enter the protected area +//! +void OCLutil::Lock::lock() { +#ifdef ATI_OS_WIN + EnterCriticalSection(&_cs); +#else + pthread_mutex_lock(&_lock); +#endif +} + +////////////////////////////////////////////////////////////// +//! +//! Try to acquire the lock, if unavailable the function returns +//! false and returns true if available(enters the critical +//! section as well in this case). +//! +bool OCLutil::Lock::tryLock() { +#ifdef ATI_OS_WIN + return (TryEnterCriticalSection(&_cs) != 0); +#else + return !((bool)pthread_mutex_trylock(&_lock)); +#endif +} + +////////////////////////////////////////////////////////////// +//! +//! Unlock the lock +//! +void OCLutil::Lock::unlock() { +#ifdef ATI_OS_WIN + LeaveCriticalSection(&_cs); +#else + pthread_mutex_unlock(&_lock); +#endif +} + +//////////////////////////////////////////////////////////////////// +//! +//! Constructor for OCLThread +//! +OCLutil::Thread::Thread() : _tid(0), _data(0) { +#ifdef ATI_OS_WIN + _ID = 0; +#else +#endif +} + +//////////////////////////////////////////////////////////////////// +//! +//! Destructor for OCLLock +//! +OCLutil::Thread::~Thread() { +#ifdef ATI_OS_WIN + CloseHandle(_tid); +#else +#endif +} + +////////////////////////////////////////////////////////////// +//! +//! Create a new thread and return the status of the operation +//! 
+bool OCLutil::Thread::create(oclThreadFunc func, void *arg) { + // Save the data internally + _data = arg; + + unsigned int retVal; + + bool verbose = getenv("VERBOSE") != NULL; + +#ifdef ATI_OS_WIN + // Setup the callback struct for thread function and pass to the + // begin thread routine + // xxx The following struct is allocated but never freed!!!! + argsToThreadFunc *args = new argsToThreadFunc; + args->func = func; + args->data = this; + + _tid = (HANDLE)_beginthreadex(NULL, 0, win32ThreadFunc, args, 0, &retVal); + + if (verbose) { + printf("Thread handle value = %p\n", _tid); + + printf("Done creating thread. Thread id value = %u\n", retVal); + } +#else + //! Now create the thread with pointer to self as the data + retVal = pthread_create(&_tid, NULL, func, arg); + + if (verbose) + printf("Done creating thread. Ret value %d, Self = %u\n", retVal, + (unsigned int)pthread_self()); +#endif + + if (retVal != 0) return false; + + return true; +} + +////////////////////////////////////////////////////////////// +//! +//! Return the thread ID for the current OCLThread +//! +unsigned int OCLutil::Thread::getID() { +#ifdef ATI_OS_WIN + return GetCurrentThreadId(); + // Type cast the thread handle to unsigned in and send it over +#else + return (unsigned int)pthread_self(); +#endif +} + +////////////////////////////////////////////////////////////// +//! +//! Wait for this thread to join +//! 
+bool OCLutil::Thread::join() { +#ifdef ATI_OS_WIN + DWORD rc = WaitForSingleObject(_tid, INFINITE); + + if (rc == WAIT_FAILED) { + printf("Bad call to function(invalid handle?)\n"); + } +#else + int rc = pthread_join(_tid, NULL); +#endif + + if (rc != 0) return false; + + return true; +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp b/projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp new file mode 100644 index 0000000000..f78fd73287 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/OCLWrapper.cpp @@ -0,0 +1,944 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLWrapper.h" + +OCLWrapper::OCLWrapper() { + clEnqueueWaitSignalAMD_ptr = + (clEnqueueWaitSignalAMD_fn)clGetExtensionFunctionAddress( + "clEnqueueWaitSignalAMD"); + clEnqueueWriteSignalAMD_ptr = + (clEnqueueWriteSignalAMD_fn)clGetExtensionFunctionAddress( + "clEnqueueWriteSignalAMD"); + clEnqueueMakeBuffersResidentAMD_ptr = + (clEnqueueMakeBuffersResidentAMD_fn)clGetExtensionFunctionAddress( + "clEnqueueMakeBuffersResidentAMD"); + + clUnloadPlatformAMD_ptr = + (clUnloadPlatformAMD_fn)clGetExtensionFunctionAddress( + "clUnloadPlatformAMD"); + + // CL-GL function pointers + clGetGLContextInfoKHR_ptr = + (clGetGLContextInfoKHR_fn)clGetExtensionFunctionAddress( + "clGetGLContextInfoKHR"); + clCreateFromGLBuffer_ptr = + (clCreateFromGLBuffer_fn)clGetExtensionFunctionAddress( + "clCreateFromGLBuffer"); + clCreateFromGLTexture_ptr = + (clCreateFromGLTexture_fn)clGetExtensionFunctionAddress( + "clCreateFromGLTexture"); + clCreateFromGLTexture2D_ptr = + (clCreateFromGLTexture2D_fn)clGetExtensionFunctionAddress( + "clCreateFromGLTexture2D"); + clCreateFromGLRenderbuffer_ptr = + (clCreateFromGLRenderbuffer_fn)clGetExtensionFunctionAddress( + "clCreateFromGLRenderbuffer"); + clGetGLObjectInfo_ptr = + (clGetGLObjectInfo_fn)clGetExtensionFunctionAddress("clGetGLObjectInfo"); + clGetGLTextureInfo_ptr = (clGetGLTextureInfo_fn)clGetExtensionFunctionAddress( + "clGetGLTextureInfo"); + clEnqueueAcquireGLObjects_ptr = + (clEnqueueAcquireGLObjects_fn)clGetExtensionFunctionAddress( + "clEnqueueAcquireGLObjects"); + clEnqueueReleaseGLObjects_ptr = + (clEnqueueReleaseGLObjects_fn)clGetExtensionFunctionAddress( + "clEnqueueReleaseGLObjects"); + + // Performance counter function pointers + clCreatePerfCounterAMD_ptr = + (clCreatePerfCounterAMD_fn)clGetExtensionFunctionAddress( + "clCreatePerfCounterAMD"); + clEnqueueBeginPerfCounterAMD_ptr = + (clEnqueueBeginPerfCounterAMD_fn)clGetExtensionFunctionAddress( + "clEnqueueBeginPerfCounterAMD"); + clEnqueueEndPerfCounterAMD_ptr 
= + (clEnqueueEndPerfCounterAMD_fn)clGetExtensionFunctionAddress( + "clEnqueueEndPerfCounterAMD"); + clGetPerfCounterInfoAMD_ptr = + (clGetPerfCounterInfoAMD_fn)clGetExtensionFunctionAddress( + "clGetPerfCounterInfoAMD"); + clReleasePerfCounterAMD_ptr = + (clReleasePerfCounterAMD_fn)clGetExtensionFunctionAddress( + "clReleasePerfCounterAMD"); + clRetainPerfCounterAMD_ptr = + (clRetainPerfCounterAMD_fn)clGetExtensionFunctionAddress( + "clRetainPerfCounterAMD"); + clSetDeviceClockModeAMD_ptr = + (clSetDeviceClockModeAMD_fn)clGetExtensionFunctionAddress( + "clSetDeviceClockModeAMD"); +} + +cl_int OCLWrapper::clGetPlatformIDs(cl_uint num_entries, + cl_platform_id *platforms, + cl_uint *num_platforms) { + return ::clGetPlatformIDs(num_entries, platforms, num_platforms); +} + +cl_int OCLWrapper::clGetPlatformInfo(cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetPlatformInfo(platform, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_int OCLWrapper::clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices) { + return ::clGetDeviceIDs(platform, device_type, num_entries, devices, + num_devices); +} + +cl_int OCLWrapper::clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetDeviceInfo(device, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_context OCLWrapper::clCreateContext( + cl_context_properties *properties, cl_uint num_devices, + const cl_device_id *devices, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) { + return ::clCreateContext(properties, num_devices, devices, pfn_notify, + user_data, errcode_ret); +} + +cl_context OCLWrapper::clCreateContextFromType( + 
cl_context_properties *properties, cl_device_type device_type, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) { + return ::clCreateContextFromType(properties, device_type, pfn_notify, + user_data, errcode_ret); +} + +cl_int OCLWrapper::clRetainContext(cl_context context) { + return ::clRetainContext(context); +} + +cl_int OCLWrapper::clReleaseContext(cl_context context) { + return ::clReleaseContext(context); +} + +cl_int OCLWrapper::clGetContextInfo(cl_context context, + cl_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetContextInfo(context, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_command_queue OCLWrapper::clCreateCommandQueue( + cl_context context, cl_device_id device, + cl_command_queue_properties properties, cl_int *errcode_ret) { +#if defined(CL_VERSION_2_0) + cl_int err; + cl_platform_id pid; + bool version20 = true; + err = ::clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), + &pid, NULL); + if (err == CL_SUCCESS) { + size_t size; + char *ver; + err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, 0, NULL, &size); + if (err == CL_SUCCESS) { + ver = new char[size]; + if (ver) { + err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, size, ver, NULL); + if (err == CL_SUCCESS) { + if (ver[8] == '1') { + version20 = false; + } + } + delete[] ver; + } + } + } + if (version20) { + const cl_queue_properties cprops[] = { + CL_QUEUE_PROPERTIES, static_cast(properties), 0}; + return ::clCreateCommandQueueWithProperties( + context, device, properties ? 
cprops : NULL, errcode_ret); + } else { + return ::clCreateCommandQueue(context, device, properties, errcode_ret); + } +#else + return ::clCreateCommandQueue(context, device, properties, errcode_ret); +#endif +} + +cl_int OCLWrapper::clRetainCommandQueue(cl_command_queue command_queue) { + return ::clRetainCommandQueue(command_queue); +} + +cl_int OCLWrapper::clReleaseCommandQueue(cl_command_queue command_queue) { + return ::clReleaseCommandQueue(command_queue); +} + +cl_int OCLWrapper::clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + return ::clGetCommandQueueInfo(command_queue, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_mem OCLWrapper::clCreateBuffer(cl_context context, cl_mem_flags flags, + size_t size, void *host_ptr, + cl_int *errcode_ret) { + return ::clCreateBuffer(context, flags, size, host_ptr, errcode_ret); +} + +cl_mem OCLWrapper::clCreateImage2D(cl_context context, cl_mem_flags flags, + const cl_image_format *image_format, + size_t image_width, size_t image_height, + size_t image_row_pitch, void *host_ptr, + cl_int *errcode_ret) { + return ::clCreateImage2D(context, flags, image_format, image_width, + image_height, image_row_pitch, host_ptr, + errcode_ret); +} + +cl_mem OCLWrapper::clCreateImage3D(cl_context context, cl_mem_flags flags, + const cl_image_format *image_format, + size_t image_width, size_t image_height, + size_t image_depth, size_t image_row_pitch, + size_t image_slice_pitch, void *host_ptr, + cl_int *errcode_ret) { + return ::clCreateImage3D(context, flags, image_format, image_width, + image_height, image_depth, image_row_pitch, + image_slice_pitch, host_ptr, errcode_ret); +} + +cl_int OCLWrapper::clRetainMemObject(cl_mem memobj) { + return ::clRetainMemObject(memobj); +} + +cl_int OCLWrapper::clReleaseMemObject(cl_mem memobj) { + return ::clReleaseMemObject(memobj); +} + +cl_int 
OCLWrapper::clGetSupportedImageFormats(cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format *image_formats, + cl_uint *num_image_formats) { + return ::clGetSupportedImageFormats(context, flags, image_type, num_entries, + image_formats, num_image_formats); +} + +cl_int OCLWrapper::clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + return ::clGetMemObjectInfo(memobj, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int OCLWrapper::clGetImageInfo(cl_mem image, cl_image_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetImageInfo(image, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_sampler OCLWrapper::clCreateSampler(cl_context context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int *errcode_ret) { +#ifdef CL_VERSION_2_0 + const cl_sampler_properties sprops[] = { + CL_SAMPLER_NORMALIZED_COORDS, + static_cast(normalized_coords), + CL_SAMPLER_ADDRESSING_MODE, + static_cast(addressing_mode), + CL_SAMPLER_FILTER_MODE, + static_cast(filter_mode), + 0}; + return ::clCreateSamplerWithProperties(context, sprops, errcode_ret); +#else + return ::clCreateSampler(context, normalized_coords, addressing_mode, + filter_mode, errcode_ret); +#endif +} + +cl_int OCLWrapper::clRetainSampler(cl_sampler sampler) { + return ::clRetainSampler(sampler); +} + +cl_int OCLWrapper::clReleaseSampler(cl_sampler sampler) { + return ::clReleaseSampler(sampler); +} + +cl_int OCLWrapper::clGetSamplerInfo(cl_sampler sampler, + cl_sampler_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetSamplerInfo(sampler, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_program 
OCLWrapper::clCreateProgramWithSource(cl_context context, + cl_uint count, + const char **strings, + const size_t *lengths, + cl_int *errcode_ret) { + return ::clCreateProgramWithSource(context, count, strings, lengths, + errcode_ret); +} + +cl_program OCLWrapper::clCreateProgramWithBinary( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const size_t *lengths, const unsigned char **binaries, + cl_int *binary_status, cl_int *errcode_ret) { + return ::clCreateProgramWithBinary(context, num_devices, device_list, lengths, + binaries, binary_status, errcode_ret); +} + +cl_int OCLWrapper::clRetainProgram(cl_program program) { + return ::clRetainProgram(program); +} + +cl_int OCLWrapper::clReleaseProgram(cl_program program) { + return ::clReleaseProgram(program); +} + +cl_int OCLWrapper::clBuildProgram( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) { + return ::clBuildProgram(program, num_devices, device_list, options, + pfn_notify, user_data); +} + +cl_int OCLWrapper::clCompileProgram( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_headers, + const cl_program *input_headers, const char **header_include_names, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) { + return ::clCompileProgram(program, num_devices, device_list, options, + num_input_headers, input_headers, + header_include_names, pfn_notify, user_data); +} + +cl_program OCLWrapper::clLinkProgram( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_programs, + const cl_program *input_programs, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data, cl_int *errcode_ret) { + return ::clLinkProgram(context, num_devices, device_list, options, + 
num_input_programs, input_programs, pfn_notify, + user_data, errcode_ret); +} + +cl_int OCLWrapper::clUnloadCompiler(void) { return ::clUnloadCompiler(); } + +cl_int OCLWrapper::clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetProgramInfo(program, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int OCLWrapper::clGetProgramBuildInfo( + cl_program program, cl_device_id device, cl_program_build_info param_name, + size_t param_value_size, void *param_value, size_t *param_value_size_ret) { + return ::clGetProgramBuildInfo(program, device, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_kernel OCLWrapper::clCreateKernel(cl_program program, + const char *kernel_name, + cl_int *errcode_ret) { + return ::clCreateKernel(program, kernel_name, errcode_ret); +} + +cl_int OCLWrapper::clCreateKernelsInProgram(cl_program program, + cl_uint num_kernels, + cl_kernel *kernels, + cl_uint *num_kernels_ret) { + return ::clCreateKernelsInProgram(program, num_kernels, kernels, + num_kernels_ret); +} + +cl_int OCLWrapper::clRetainKernel(cl_kernel kernel) { + return ::clRetainKernel(kernel); +} + +cl_int OCLWrapper::clReleaseKernel(cl_kernel kernel) { + return ::clReleaseKernel(kernel); +} + +cl_int OCLWrapper::clSetKernelArg(cl_kernel kernel, cl_uint arg_index, + size_t arg_size, const void *arg_value) { + return ::clSetKernelArg(kernel, arg_index, arg_size, arg_value); +} + +cl_int OCLWrapper::clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetKernelInfo(kernel, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int OCLWrapper::clGetKernelWorkGroupInfo( + cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, + size_t param_value_size, void *param_value, size_t 
*param_value_size_ret) { + return ::clGetKernelWorkGroupInfo(kernel, device, param_name, + param_value_size, param_value, + param_value_size_ret); +} + +cl_int OCLWrapper::clWaitForEvents(cl_uint num_events, + const cl_event *event_list) { + return ::clWaitForEvents(num_events, event_list); +} + +cl_int OCLWrapper::clGetEventInfo(cl_event evnt, cl_event_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetEventInfo(evnt, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int OCLWrapper::clRetainEvent(cl_event evnt) { + return ::clRetainEvent(evnt); +} + +cl_int OCLWrapper::clReleaseEvent(cl_event evnt) { + return ::clReleaseEvent(evnt); +} + +cl_int OCLWrapper::clGetEventProfilingInfo(cl_event evnt, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + return ::clGetEventProfilingInfo(evnt, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_int OCLWrapper::clFlush(cl_command_queue command_queue) { + return ::clFlush(command_queue); +} + +cl_int OCLWrapper::clFinish(cl_command_queue command_queue) { + return ::clFinish(command_queue); +} + +cl_int OCLWrapper::clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, cl_bool blocking_read, + size_t offset, size_t cb, void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueReadBuffer(command_queue, buffer, blocking_read, offset, cb, + ptr, num_events_in_wait_list, event_wait_list, + evnt); +} + +cl_int OCLWrapper::clEnqueueWriteBuffer( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueWriteBuffer(command_queue, buffer, blocking_write, offset, + cb, ptr, num_events_in_wait_list, + event_wait_list, 
evnt); +} + +cl_int OCLWrapper::clEnqueueCopyBuffer(cl_command_queue command_queue, + cl_mem src_buffer, cl_mem dst_buffer, + size_t src_offset, size_t dst_offset, + size_t cb, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueCopyBuffer(command_queue, src_buffer, dst_buffer, + src_offset, dst_offset, cb, + num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueReadBufferRect( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueReadBufferRect( + command_queue, buffer, blocking_read, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, + ptr, num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueWriteBufferRect( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueWriteBufferRect( + command_queue, buffer, blocking_write, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, + ptr, num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueCopyBufferRect( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t 
dst_slice_pitch, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueCopyBufferRect( + command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region, + src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch, + num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueReadImage( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, + const size_t *origin, const size_t *region, size_t row_pitch, + size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueReadImage(command_queue, image, blocking_read, origin, + region, row_pitch, slice_pitch, ptr, + num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueWriteImage( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, + const size_t *origin, const size_t *region, size_t input_row_pitch, + size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueWriteImage(command_queue, image, blocking_write, origin, + region, input_row_pitch, input_slice_pitch, ptr, + num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueCopyImage( + cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueCopyImage(command_queue, src_image, dst_image, src_origin, + dst_origin, region, num_events_in_wait_list, + event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueCopyImageToBuffer( + cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, + const size_t *src_origin, const size_t *region, size_t dst_offset, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event 
*evnt) { + return ::clEnqueueCopyImageToBuffer( + command_queue, src_image, dst_buffer, src_origin, region, dst_offset, + num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueCopyBufferToImage( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, + size_t src_offset, const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueCopyBufferToImage( + command_queue, src_buffer, dst_image, src_offset, dst_origin, region, + num_events_in_wait_list, event_wait_list, evnt); +} + +void *OCLWrapper::clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, cl_bool blocking_map, + cl_map_flags map_flags, size_t offset, + size_t cb, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt, cl_int *errcode_ret) { + return ::clEnqueueMapBuffer(command_queue, buffer, blocking_map, map_flags, + offset, cb, num_events_in_wait_list, + event_wait_list, evnt, errcode_ret); +} + +void *OCLWrapper::clEnqueueMapImage( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, + cl_map_flags map_flags, const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *evnt, cl_int *errcode_ret) { + return ::clEnqueueMapImage(command_queue, image, blocking_map, map_flags, + origin, region, image_row_pitch, image_slice_pitch, + num_events_in_wait_list, event_wait_list, evnt, + errcode_ret); +} + +cl_int OCLWrapper::clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueUnmapMemObject(command_queue, memobj, mapped_ptr, + num_events_in_wait_list, event_wait_list, + evnt); +} + +cl_int OCLWrapper::clEnqueueNDRangeKernel( + cl_command_queue 
command_queue, cl_kernel kernel, cl_uint work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueNDRangeKernel( + command_queue, kernel, work_dim, global_work_offset, global_work_size, + local_work_size, num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueTask(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt) { +#if defined(CL_VERSION_2_0) + static size_t const globalWorkSize[3] = {1, 0, 0}; + static size_t const localWorkSize[3] = {1, 0, 0}; + + return ::clEnqueueNDRangeKernel( + command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, + num_events_in_wait_list, event_wait_list, evnt); +#else + return ::clEnqueueTask(command_queue, kernel, num_events_in_wait_list, + event_wait_list, evnt); +#endif +} + +cl_int OCLWrapper::clEnqueueNativeKernel( + cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *), + void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, + const void **args_mem_loc, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueNativeKernel( + command_queue, user_func, args, cb_args, num_mem_objects, mem_list, + args_mem_loc, num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueMarker(cl_command_queue command_queue, + cl_event *evnt) { + return ::clEnqueueMarker(command_queue, evnt); +} + +cl_int OCLWrapper::clEnqueueMarkerWithWaitList(cl_command_queue command_queue, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *evnt) { + return ::clEnqueueMarkerWithWaitList(command_queue, num_events_in_wait_list, + event_wait_list, evnt); +} + +cl_int OCLWrapper::clEnqueueWaitForEvents(cl_command_queue command_queue, + cl_uint 
num_events, + const cl_event *event_list) { + return ::clEnqueueWaitForEvents(command_queue, num_events, event_list); +} + +cl_int OCLWrapper::clEnqueueBarrier(cl_command_queue command_queue) { + return ::clEnqueueBarrier(command_queue); +} + +void *OCLWrapper::clGetExtensionFunctionAddress(const char *func_name) { + return ::clGetExtensionFunctionAddress(func_name); +} + +cl_mem OCLWrapper::clCreateImage(cl_context context, cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, cl_int *errcode_ret) { + return ::clCreateImage(context, flags, image_format, image_desc, host_ptr, + errcode_ret); +} + +cl_mem OCLWrapper::clCreateSubBuffer(cl_mem mem, cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void *buffer_create_info, + cl_int *errcode_ret) { + return ::clCreateSubBuffer(mem, flags, buffer_create_type, buffer_create_info, + errcode_ret); +} + +cl_int OCLWrapper::clSetEventCallback( + cl_event event, cl_int command_exec_callback_type, + void(CL_CALLBACK *pfn_event_notify)(cl_event event, + cl_int event_command_exec_status, + void *user_data), + void *user_data) { + return ::clSetEventCallback(event, command_exec_callback_type, + pfn_event_notify, user_data); +} + +cl_int OCLWrapper::clEnqueueFillImage( + cl_command_queue command_queue, cl_mem image, void *ptr, + const size_t *origin, const size_t *region, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *evnt) { + return ::clEnqueueFillImage(command_queue, image, ptr, origin, region, + num_events_in_wait_list, event_wait_list, evnt); +} + +cl_int OCLWrapper::clUnloadPlatformAMD(cl_platform_id id) { + if (clUnloadPlatformAMD_ptr) return clUnloadPlatformAMD_ptr(id); + return CL_SUCCESS; +} +cl_int OCLWrapper::clEnqueueWaitSignalAMD(cl_command_queue command_queue, + cl_mem mem_object, cl_uint value, + cl_uint num_events, + const cl_event *event_wait_list, + cl_event *event) { + return 
clEnqueueWaitSignalAMD_ptr(command_queue, mem_object, value, + num_events, event_wait_list, event); +} + +cl_int OCLWrapper::clEnqueueWriteSignalAMD(cl_command_queue command_queue, + cl_mem mem_object, cl_uint value, + cl_ulong offset, cl_uint num_events, + const cl_event *event_list, + cl_event *event) { + return clEnqueueWriteSignalAMD_ptr(command_queue, mem_object, value, offset, + num_events, event_list, event); +} + +cl_int OCLWrapper::clEnqueueMakeBuffersResidentAMD( + cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects, + cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses, + cl_uint num_events, const cl_event *event_list, cl_event *event) { + return clEnqueueMakeBuffersResidentAMD_ptr( + command_queue, num_mem_objs, mem_objects, blocking_make_resident, + bus_addresses, num_events, event_list, event); +} + +cl_int OCLWrapper::clEnqueueMigrateMemObjects(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem *mem_objects, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return ::clEnqueueMigrateMemObjects( + command_queue, num_mem_objects, mem_objects, flags, + num_events_in_wait_list, event_wait_list, event); +} + +cl_int OCLWrapper::clGetGLContextInfoKHR( + const cl_context_properties *properties, cl_gl_context_info param_name, + size_t param_value_size, void *param_value, size_t *param_value_size_ret) { + return (*clGetGLContextInfoKHR_ptr)(properties, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_mem OCLWrapper::clCreateFromGLBuffer(cl_context context, cl_mem_flags flags, + unsigned int bufobj, int *errcode_ret) { + return (*clCreateFromGLBuffer_ptr)(context, flags, bufobj, errcode_ret); +} + +cl_mem OCLWrapper::clCreateFromGLTexture(cl_context context, cl_mem_flags flags, + unsigned int texture_target, + int miplevel, unsigned int texture, + cl_int *errcode_ret) { + return 
(*clCreateFromGLTexture_ptr)(context, flags, texture_target, miplevel, + texture, errcode_ret); +} + +cl_mem OCLWrapper::clCreateFromGLTexture2D(cl_context context, + cl_mem_flags flags, + unsigned int texture_target, + int miplevel, unsigned int texture, + cl_int *errcode_ret) { + return (*clCreateFromGLTexture2D_ptr)(context, flags, texture_target, + miplevel, texture, errcode_ret); +} + +cl_mem OCLWrapper::clCreateFromGLRenderbuffer(cl_context context, + cl_mem_flags flags, + unsigned int renderbuffer, + cl_int *errcode_ret) { + return (*clCreateFromGLRenderbuffer_ptr)(context, flags, renderbuffer, + errcode_ret); +} + +cl_int OCLWrapper::clGetGLObjectInfo(cl_mem memobj, + cl_gl_object_type *gl_object_type, + unsigned int *gl_object_name) { + return (*clGetGLObjectInfo_ptr)(memobj, gl_object_type, gl_object_name); +} + +cl_int OCLWrapper::clGetGLTextureInfo(cl_mem memobj, + cl_gl_texture_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + return (*clGetGLTextureInfo_ptr)(memobj, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_int OCLWrapper::clEnqueueAcquireGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return (*clEnqueueAcquireGLObjects_ptr)(command_queue, num_objects, + mem_objects, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int OCLWrapper::clEnqueueReleaseGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return (*clEnqueueReleaseGLObjects_ptr)(command_queue, num_objects, + mem_objects, num_events_in_wait_list, + event_wait_list, event); +} + +#if defined(CL_VERSION_2_0) +cl_command_queue OCLWrapper::clCreateCommandQueueWithProperties( + cl_context context, cl_device_id device, + const 
cl_queue_properties *properties, cl_int *errcode_ret) { + return ::clCreateCommandQueueWithProperties(context, device, properties, + errcode_ret); +} + +void *OCLWrapper::clSVMAlloc(cl_context context, cl_svm_mem_flags flags, + size_t size, cl_uint alignment) { + return ::clSVMAlloc(context, flags, size, alignment); +} + +void OCLWrapper::clSVMFree(cl_context context, void *svm_pointer) { + return ::clSVMFree(context, svm_pointer); +} + +cl_int OCLWrapper::clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, cl_map_flags flags, + void *svm_ptr, size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return ::clEnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size, + num_events_in_wait_list, event_wait_list, event); +} + +cl_int OCLWrapper::clEnqueueSVMUnmap(cl_command_queue command_queue, + void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return ::clEnqueueSVMUnmap(command_queue, svm_ptr, num_events_in_wait_list, + event_wait_list, event); +} +cl_int OCLWrapper::clEnqueueSVMMemFill(cl_command_queue command_queue, + void *svm_ptr, const void *pattern, + size_t pattern_size, size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return ::clEnqueueSVMMemFill(command_queue, svm_ptr, pattern, pattern_size, + size, num_events_in_wait_list, event_wait_list, + event); +} + +cl_int OCLWrapper::clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index, + const void *arg_value) { + return ::clSetKernelArgSVMPointer(kernel, arg_index, arg_value); +} + +cl_mem OCLWrapper::clCreatePipe(cl_context context, cl_mem_flags flags, + cl_uint packet_size, cl_uint pipe_max_packets, + const cl_pipe_properties *properties, + cl_int *errcode_ret) { + return ::clCreatePipe(context, flags, packet_size, pipe_max_packets, + properties, errcode_ret); +} + +cl_int OCLWrapper::clGetPipeInfo(cl_mem pipe, 
cl_pipe_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return ::clGetPipeInfo(pipe, param_name, param_value_size, param_value, + param_value_size_ret); +} + +#endif + +cl_perfcounter_amd OCLWrapper::clCreatePerfCounterAMD( + cl_device_id device, cl_perfcounter_property *properties, + cl_int *errcode_ret) { + return (*clCreatePerfCounterAMD_ptr)(device, properties, errcode_ret); +} + +cl_int OCLWrapper::clEnqueueBeginPerfCounterAMD( + cl_command_queue command_queue, cl_uint num_perf_counters, + cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { + return (*clEnqueueBeginPerfCounterAMD_ptr)( + command_queue, num_perf_counters, perf_counters, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int OCLWrapper::clEnqueueEndPerfCounterAMD(cl_command_queue command_queue, + cl_uint num_perf_counters, + cl_perfcounter_amd *perf_counters, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + return (*clEnqueueEndPerfCounterAMD_ptr)( + command_queue, num_perf_counters, perf_counters, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int OCLWrapper::clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter, + cl_perfcounter_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + return (*clGetPerfCounterInfoAMD_ptr)(perf_counter, param_name, + param_value_size, param_value, + param_value_size_ret); +} + +cl_int OCLWrapper::clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter) { + return (*clReleasePerfCounterAMD_ptr)(perf_counter); +} + +cl_int OCLWrapper::clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter) { + return (*clRetainPerfCounterAMD_ptr)(perf_counter); +} + +cl_int OCLWrapper::clSetDeviceClockModeAMD( + cl_device_id device, + cl_set_device_clock_mode_input_amd set_clock_mode_input, + cl_set_device_clock_mode_output_amd 
*set_clock_mode_output) { + return (*clSetDeviceClockModeAMD_ptr)(device, set_clock_mode_input, + set_clock_mode_output); +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/Timer.cpp b/projects/clr/opencl/tests/ocltst/module/common/Timer.cpp new file mode 100644 index 0000000000..4ee095085f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/Timer.cpp @@ -0,0 +1,112 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "Timer.h" + +#ifdef ATI_OS_WIN +#include +#endif + +#ifdef ATI_OS_LINUX +#include +#define NANOSECONDS_PER_SEC 1000000000 +#endif + +CPerfCounter::CPerfCounter() : _clocks(0), _start(0) { +#ifdef ATI_OS_WIN + + QueryPerformanceFrequency((LARGE_INTEGER *)&_freq); + +#endif + +#ifdef ATI_OS_LINUX + _freq = NANOSECONDS_PER_SEC; +#endif +} + +CPerfCounter::~CPerfCounter() { + // EMPTY! 
+} + +void CPerfCounter::Start(void) { +#ifdef ATI_OS_WIN + + if (_start) { + MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK); + exit(0); + } + QueryPerformanceCounter((LARGE_INTEGER *)&_start); + +#endif +#ifdef ATI_OS_LINUX + + struct timespec s; + clock_gettime(CLOCK_MONOTONIC, &s); + _start = (i64)s.tv_sec * NANOSECONDS_PER_SEC + (i64)s.tv_nsec; + +#endif +} + +void CPerfCounter::Stop(void) { + i64 n; + +#ifdef ATI_OS_WIN + + if (!_start) { + MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK); + exit(0); + } + + QueryPerformanceCounter((LARGE_INTEGER *)&n); + +#endif +#ifdef ATI_OS_LINUX + + struct timespec s; + clock_gettime(CLOCK_MONOTONIC, &s); + n = (i64)s.tv_sec * NANOSECONDS_PER_SEC + (i64)s.tv_nsec; + +#endif + + n -= _start; + _start = 0; + _clocks += n; +} + +void CPerfCounter::Reset(void) { +#ifdef ATI_OS_WIN + if (_start) { + MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK); + exit(0); + } +#endif + _clocks = 0; +} + +double CPerfCounter::GetElapsedTime(void) { +#ifdef ATI_OS_WIN + if (_start) { + MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK); + exit(0); + } +#endif + + return (double)_clocks / (double)_freq; +} diff --git a/projects/clr/opencl/tests/ocltst/module/common/Timer.h b/projects/clr/opencl/tests/ocltst/module/common/Timer.h new file mode 100644 index 0000000000..fd56fe3b0d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/common/Timer.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
// Accumulating stopwatch built on a high-resolution clock
// (QueryPerformanceCounter on Windows, CLOCK_MONOTONIC on Linux).
// Typical use: Start(), Stop() one or more times, then GetElapsedTime().
class CPerfCounter {
 public:
  CPerfCounter();
  ~CPerfCounter();
  // Begins a measured interval.
  void Start(void);
  // Ends the current interval and adds its length to the running total.
  void Stop(void);
  // Clears the running total.
  void Reset(void);
  // Returns the running total in seconds (_clocks / _freq).
  double GetElapsedTime(void);

 private:
  i64 _freq;    // clock ticks per second
  i64 _clocks;  // ticks accumulated over all completed Start/Stop intervals
  i64 _start;   // tick value captured by Start(); 0 while not running
};
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLDX11Common.h" + +#define D3D_FEATURE_LEVEL_11_1 0xb100 + +#define INITPFN(x) \ + x = (x##_fn)clGetExtensionFunctionAddressForPlatform(platform_, #x); \ + if ((x) == NULL) { \ + char* buf = (char*)malloc(4096); \ + _errorFlag = true; \ + int rc = snprintf(buf, 4096, "Failed to get function pointer for %s", #x); \ + assert(rc >= 0 && rc < (int)4096); \ + printf("%s:%d - %s\n", __FILE__, __LINE__, buf); \ + _errorMsg = std::string(buf); \ + _crcword += 1; \ + free(buf); \ + return; \ + } + +OCLDX11Common::OCLDX11Common() : OCLTestImp() { + clGetDeviceIDsFromD3D11KHR = NULL; + clCreateFromD3D11BufferKHR = NULL; + clCreateFromD3D11Texture2DKHR = NULL; + clCreateFromD3D11Texture3DKHR = NULL; + clEnqueueAcquireD3D11ObjectsKHR = NULL; + clEnqueueReleaseD3D11ObjectsKHR = NULL; + clGetPlaneFromImageAMD = NULL; +} + +OCLDX11Common::~OCLDX11Common() {} + +void OCLDX11Common::ExtensionCheck() { + cl_int result = CL_SUCCESS; + char extensions[1024]; + + result = _wrapper->clGetPlatformInfo(platform_, CL_PLATFORM_EXTENSIONS, + sizeof(extensions), extensions, NULL); + CHECK_RESULT(result != CL_SUCCESS, "Failed to list platform extensions."); + + extensionsAvailable = + strstr(extensions, "cl_khr_d3d11_sharing") ? 
true : false; + if (!extensionsAvailable) { + printf("cl_khr_d3d11_sharing extension is required for this test!\n"); + } + + OSVERSIONINFOEX versionInfo = {0}; + versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + versionInfo.dwMajorVersion = 6; + + DWORDLONG conditionMask = 0; + VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL); + if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION, conditionMask)) { + CHECK_RESULT(!extensionsAvailable, + "Extension should be exported on Windows >= 6"); + } else { + CHECK_RESULT(extensionsAvailable, + "Extension should not be exported on Windows < 6"); + } + + result = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS, + sizeof(extensions), extensions, NULL); + CHECK_RESULT(result != CL_SUCCESS, "Failed to list device extensions."); + + extensionsAvailable = strstr(extensions, "cl_amd_planar_yuv") ? true : false; + if (!extensionsAvailable) { + printf("cl_amd_planar_yuv extension is required for this test!\n"); + } +} + +void OCLDX11Common::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + // OpenCL Initialization + // OCLTestImp::open(test, units, conversion, deviceId); + BaseTestImp::open(); + devices_ = 0; + deviceCount_ = 0; + context_ = 0; + program_ = 0; + kernel_ = 0; + _queue = 0; + _deviceId = deviceId; + + dxD3D11Context = NULL; + dxD3D11Device = NULL; + + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_); + + cl_uint numPlatforms = 0; + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed"); + CHECK_RESULT((numPlatforms == 0), "No platform found"); + + cl_platform_id* platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + platform_ = platforms[_platformIndex]; + CHECK_RESULT((platform_ == 0), "AMD Platform not 
found"); + + delete[] platforms; + + error_ = _wrapper->clGetDeviceIDs(platform_, type_, 0, NULL, &deviceCount_); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed"); + + devices_ = new cl_device_id[deviceCount_]; + error_ = + _wrapper->clGetDeviceIDs(platform_, type_, deviceCount_, devices_, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed"); + + ExtensionCheck(); + if (!extensionsAvailable) { + return; + } + + // extract function pointers for exported functions + INITPFN(clGetDeviceIDsFromD3D11KHR); + INITPFN(clCreateFromD3D11BufferKHR); + INITPFN(clCreateFromD3D11Texture2DKHR); + INITPFN(clCreateFromD3D11Texture3DKHR); + INITPFN(clEnqueueAcquireD3D11ObjectsKHR); + INITPFN(clEnqueueReleaseD3D11ObjectsKHR); + INITPFN(clGetPlaneFromImageAMD); + + char name[1024] = {0}; + size_t size = 0; + + if (deviceId >= deviceCount_) { + _errorFlag = true; + return; + } + + HRESULT hr = S_OK; + + UINT createDeviceFlags = 0; + + D3D_FEATURE_LEVEL featureLevels[] = { + (D3D_FEATURE_LEVEL)D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0 + + }; + D3D_FEATURE_LEVEL featureLevel; + // Create only the device, not the swapchain. 
We can't create the swapchain + // anyways without a handle to a window we explicitly own + hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, + createDeviceFlags, featureLevels, + _countof(featureLevels), D3D11_SDK_VERSION, + &dxD3D11Device, &featureLevel, &dxD3D11Context); + + if (FAILED(hr)) { + hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, + createDeviceFlags, featureLevels + 1, + _countof(featureLevels) - 1, D3D11_SDK_VERSION, + &dxD3D11Device, &featureLevel, &dxD3D11Context); + } + if (FAILED(hr)) { + hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_SOFTWARE, NULL, + createDeviceFlags, featureLevels, + _countof(featureLevels), D3D11_SDK_VERSION, + &dxD3D11Device, &featureLevel, &dxD3D11Context); + } + + if (FAILED(hr)) { + hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_SOFTWARE, NULL, + createDeviceFlags, featureLevels + 1, + _countof(featureLevels) - 1, D3D11_SDK_VERSION, + &dxD3D11Device, &featureLevel, &dxD3D11Context); + } + + cl_int status = 0; + cl_context_properties cps[7] = { + CL_CONTEXT_D3D11_DEVICE_KHR, + (cl_context_properties)(ID3D11Device*)dxD3D11Device, + CL_CONTEXT_INTEROP_USER_SYNC, + CL_FALSE, + CL_CONTEXT_PLATFORM, + (cl_context_properties)platform_, + 0}; + cl_context_properties* cprops = (NULL == platform_) ? 
NULL : cps; + + cl_uint deviceListSize = 0; + clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device, + CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 0, NULL, + &deviceListSize); + + std::vector devices; + devices.resize(deviceListSize); + clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device, + CL_PREFERRED_DEVICES_FOR_D3D11_KHR, deviceListSize, + &devices[0], NULL); + + bool ret = false; + // Check that current device can be associated with OpenGL context + for (unsigned int i = 0; i < deviceListSize; i++) { + if (devices[i] == devices_[_deviceId]) { + ret = true; + break; + } + } + if (ret) { + char buf[2000]; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, + sizeof(buf), buf, NULL); + + context_ = + clCreateContext(cprops, 1, &devices_[_deviceId], NULL, NULL, &status); + _queue = clCreateCommandQueue(context_, devices_[_deviceId], 0, &status); + } + CHECK_RESULT((ret != true), "Can't find D3D device!"); +} + +unsigned int OCLDX11Common::close(void) { + clReleaseCommandQueue(_queue); + unsigned int retVal = OCLTestImp::close(); + // deleteDXDevice(hDX_); + if (dxD3D11Context) dxD3D11Context->Release(); + if (dxD3D11Device) dxD3D11Device->Release(); + return retVal; +} diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h new file mode 100644 index 0000000000..0897cd6ad4 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11Common.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
// Common base for the Direct3D 11 interop tests.  open() selects the OpenCL
// platform/device, checks the required extensions, resolves the interop
// entry points and creates a D3D11 device plus an OpenCL context and queue
// for it; close() releases them again.
class OCLDX11Common : public OCLTestImp {
 public:
  ///////////////////////////////////
  // construction and destruction  //
  ///////////////////////////////////
  OCLDX11Common();
  virtual ~OCLDX11Common();
  ///////////////////////
  // virtual interface //
  ///////////////////////
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceId);
  virtual unsigned int close(void);

 protected:
  // Result of ExtensionCheck(); when false, open() returns early without
  // creating the D3D11 device or the OpenCL context.
  bool extensionsAvailable;

  ID3D11Device* dxD3D11Device;          // created in open(), released in close()
  ID3D11DeviceContext* dxD3D11Context;  // created in open(), released in close()
  // Not created or released by OCLDX11Common.cpp; derived tests own it.
  ID3D11Texture2D* dxDX11Texture;
  cl_command_queue _queue;  // created in open(), released in close()

  // Extension entry points resolved in open() with
  // clGetExtensionFunctionAddressForPlatform().
  clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR;
  clCreateFromD3D11BufferKHR_fn clCreateFromD3D11BufferKHR;
  clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR;
  clCreateFromD3D11Texture3DKHR_fn clCreateFromD3D11Texture3DKHR;
  clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR;
  clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR;
  clGetPlaneFromImageAMD_fn clGetPlaneFromImageAMD;

 private:
  // Queries platform and device extension strings and sets
  // extensionsAvailable.
  void ExtensionCheck();
};
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLDX11YUY2.h" + +#include +#include +#include +#include + +#define DXGI_FORMAT_NV12 103 +#define DXGI_FORMAT_P010 104 +#define GROUP_SIZE 256 + +const static char strKernel[] = + "__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | " + "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n" + "__kernel void image2imageCopy( " + " \n" + " __read_only image2d_t input, " + " \n" + " __write_only image2d_t output) " + " \n" + "{ " + " \n" + " int2 coord = (int2)(get_global_id(0), get_global_id(1)); " + " \n" + " uint4 temp = read_imageui(input, imageSampler, coord); " + " \n" + " write_imageui(output, coord, temp); " + " \n" + "} " + " \n"; + +OCLDX11YUY2::OCLDX11YUY2() : OCLDX11Common() { + _numSubTests = 4; + blockSizeX = GROUP_SIZE; + blockSizeY = 1; +} + +OCLDX11YUY2::~OCLDX11YUY2() {} + +void OCLDX11YUY2::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + dxDX11Texture = 0; + clImage2DOut = 0; + _openTest = test; + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLDX11Common::open(test, units, conversion, deviceId); + if (_errorFlag) return; + if (!extensionsAvailable) { + return; + } + + if (_openTest < 2) { + dxFormat = (DXGI_FORMAT)DXGI_FORMAT_NV12; + extensionsAvailable = formatSupported(); + if (!extensionsAvailable) { + printf("DXGI_FORMAT_NV12 is required for this test!\n"); + return; + } + } else { + dxFormat = (DXGI_FORMAT)DXGI_FORMAT_P010; + extensionsAvailable = formatSupported(); + if (!extensionsAvailable) { + printf("DXGI_FORMAT_P010 is required for this test!\n"); + return; + } + } + + CompileKernel(); + AllocateOpenCLImage(); +} + +void OCLDX11YUY2::run(void) { + if (_errorFlag) return; + if (!extensionsAvailable) return; + + D3D11_TEXTURE2D_DESC Desc = {0}; + + Desc.ArraySize = 1; + Desc.BindFlags = 0; + Desc.Format = dxFormat; + Desc.Width = OCLDX11YUY2::WIDTH; + Desc.Height = OCLDX11YUY2::HEIGHT; + Desc.MipLevels = 1; + Desc.SampleDesc.Count = 1; + // 
Desc.MiscFlags=D3D11_RESOURCE_MISC_SHARED; //MM for fast GPU interop + // MM: these flags are incompatible with D3D11_RESOURCE_MISC_SHARED + // now we allocate texture without CPU access and if needed use temp texture + // (see FromSystemToDX11 and FromDX11ToSystem) + + Desc.Usage = D3D11_USAGE_STAGING; + Desc.BindFlags = 0; + Desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE | D3D11_CPU_ACCESS_READ; + + ID3D11Texture2D *pTextureTmp; + HRESULT hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &pTextureTmp); + + // fill memory + D3D11_MAPPED_SUBRESOURCE LockedRectD11; + if (SUCCEEDED(hr)) { + hr = + dxD3D11Context->Map(pTextureTmp, 0, D3D11_MAP_WRITE, 0, &LockedRectD11); + } + if (SUCCEEDED(hr)) { + // fill memory with something + for (int y = 0; y < OCLDX11YUY2::HEIGHT; y++) { + BYTE *pLine = (BYTE *)LockedRectD11.pData + y * LockedRectD11.RowPitch; + + BYTE *pLineUV = (BYTE *)LockedRectD11.pData + y * LockedRectD11.RowPitch + + OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch; + + for (int x = 0; x < OCLDX11YUY2::WIDTH; x++) { + *pLine++ = 0x7F; // Y + if (y < OCLDX11YUY2::HEIGHT / 2 && x < OCLDX11YUY2::WIDTH / 2) { + *pLineUV++ = 0x1F; // U + *pLineUV++ = 0x2F; // V + } + } + } + + dxD3D11Context->Unmap(pTextureTmp, 0); + } + Desc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE; + Desc.Usage = D3D11_USAGE_DEFAULT; + Desc.CPUAccessFlags = 0; + Desc.MiscFlags = (_openTest == 0) + ? 0 + : D3D11_RESOURCE_MISC_SHARED; // MM for fast GPU interop + + hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &dxDX11Texture); + + if (pTextureTmp != NULL) { + dxD3D11Context->CopySubresourceRegion(dxDX11Texture, 0, 0, 0, 0, + pTextureTmp, 0, NULL); + pTextureTmp->Release(); + } + testInterop(); +} + +void OCLDX11YUY2::AllocateOpenCLImage() { + cl_int status = 0; + + cl_image_format format{}; + format.image_channel_order = CL_R; + format.image_channel_data_type = + (dxFormat == DXGI_FORMAT_NV12) ? 
CL_UNSIGNED_INT8 : CL_UNSIGNED_INT16; + cl_image_desc descr{}; + descr.image_type = CL_MEM_OBJECT_IMAGE2D; + descr.image_width = WIDTH; + descr.image_height = HEIGHT + HEIGHT / 2; + + clImage2DOut = clCreateImage(context_, CL_MEM_WRITE_ONLY, &format, &descr, + NULL, &status); + CHECK_RESULT((status != CL_SUCCESS), "AllocateOpenCLImage() failed"); +} + +void OCLDX11YUY2::testInterop() { + // alloc + cl_int clStatus = 0; + cl_mem clImage2D = + clCreateFromD3D11Texture2DKHR(context_, 0, dxDX11Texture, 0, &clStatus); + CHECK_RESULT((clStatus != CL_SUCCESS), + "clCreateFromD3D11Texture2DKHR() failed"); + + // bring objects to the queue + cl_event clEvent = NULL; + clEnqueueAcquireD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent); + clStatus = clWaitForEvents(1, &clEvent); + clReleaseEvent(clEvent); + + CopyOpenCLImage(clImage2D); + bool ImageReadWorks = CheckCLImage(clImage2D); + bool bKernelWorks = CheckCLImage(clImage2DOut); + CHECK_RESULT_NO_RETURN((ImageReadWorks != true), + "CheckCLImage(clImage2D) failed"); + CHECK_RESULT_NO_RETURN((bKernelWorks != true), + "CheckCLImage(clImage2DOut) failed"); + + cl_mem planeY = clGetPlaneFromImageAMD(context_, clImage2D, 0, &clStatus); + CHECK_RESULT((clStatus != CL_SUCCESS), + "clGetPlaneFromImageAMD(context_,clImage2D,0,&clStatus) failed"); + + cl_mem planeUV = clGetPlaneFromImageAMD(context_, clImage2D, 1, &clStatus); + CHECK_RESULT((clStatus != CL_SUCCESS), + "clGetPlaneFromImageAMD(context_,clImage2D,1,&clStatus) failed"); + + bool ImageWorksY = CheckCLImageY(planeY); + bool ImageWorksUV = CheckCLImageUV(planeUV); + + clReleaseMemObject(planeY); + clReleaseMemObject(planeUV); + + // release + clEvent = NULL; + // release object from the queue + clStatus = + clEnqueueReleaseD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent); + clStatus = clWaitForEvents(1, &clEvent); + clReleaseEvent(clEvent); + + // release mem object + clReleaseMemObject(clImage2D); + + CHECK_RESULT_NO_RETURN((ImageWorksY != true), 
"CheckCLImageY() failed"); + CHECK_RESULT_NO_RETURN((ImageWorksUV != true), "CheckCLImageUV() failed"); +} + +unsigned int OCLDX11YUY2::close(void) { + if (clImage2DOut) clReleaseMemObject(clImage2DOut); + if (dxDX11Texture) dxDX11Texture->Release(); + return OCLDX11Common::close(); +} + +bool OCLDX11YUY2::CheckCLImage(cl_mem clImage) { + cl_int clStatus = 0; + + size_t pitch = 0; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL); + pitch *= 2; + + cl_image_format format; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(format), &format, NULL); + + size_t height; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); + + CHECK_RESULT_NO_RETURN(height != (HEIGHT + HEIGHT / 2), + "CheckCLImage: height!=(HEIGHT+HEIGHT/2)"); + + char *pTempBuffer = new char[(HEIGHT + HEIGHT / 2) * pitch]; + + size_t origin[] = {0, 0, 0}; + size_t region[] = {WIDTH, HEIGHT + HEIGHT / 2, 1}; + clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0, + pTempBuffer, 0, 0, 0); + + ::clFinish(_queue); + + // test + + bool bBreak = false; + for (int y = 0; y < HEIGHT && !bBreak; y++) { + char *pLine = (char *)pTempBuffer + y * pitch; + char *pLineUV = (char *)pTempBuffer + y * pitch + HEIGHT * pitch; + + for (int x = 0; x < WIDTH; x++) { + if (*pLine != 0x7F) // Y + { + bBreak = true; + break; + } + pLine++; + if (y < HEIGHT / 2 && x < WIDTH / 2) { + if (*pLineUV != 0x1F) // U + { + bBreak = true; + break; + } + pLineUV++; + if (*pLineUV != 0x2F) // V + { + bBreak = true; + break; + } + pLineUV++; + } + } + } + delete[] pTempBuffer; + + return !bBreak; +} + +bool OCLDX11YUY2::CheckCLImageY(cl_mem clImage) { + cl_int clStatus = 0; + + size_t pitch = 0; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL); + pitch *= 2; + + cl_image_format format; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(format), &format, NULL); + + size_t height; + 
clStatus = + clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); + + CHECK_RESULT_NO_RETURN(height != HEIGHT, "CheckCLImageY: height!=HEIGHT"); + + char *pTempBuffer = new char[HEIGHT * pitch]; + + size_t origin[] = {0, 0, 0}; + size_t region[] = {WIDTH, HEIGHT, 1}; + clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0, + pTempBuffer, 0, 0, 0); + + ::clFinish(_queue); + + // test + + bool bBreak = false; + for (int y = 0; y < HEIGHT && !bBreak; y++) { + char *pLine = (char *)pTempBuffer + y * pitch; + for (int x = 0; x < WIDTH; x++) { + if (*pLine != 0x7F) // Y + { + bBreak = true; + break; + } + pLine++; + } + } + + delete[] pTempBuffer; + + return !bBreak; +} + +bool OCLDX11YUY2::CheckCLImageUV(cl_mem clImage) { + cl_int clStatus = 0; + + size_t pitch = 0; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL); + pitch *= 2; + size_t width = 0; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(width), &width, NULL); + + cl_image_format format; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(format), &format, NULL); + + size_t height; + clStatus = + clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); + + CHECK_RESULT_NO_RETURN(height != HEIGHT / 2, + "CheckCLImageUV: height!=HEIGHT/2"); + + char *pTempBuffer = new char[(HEIGHT / 2) * pitch]; + + size_t origin[] = {0, 0, 0}; + size_t region[] = {WIDTH / 2, HEIGHT / 2, 1}; + clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0, + pTempBuffer, 0, 0, 0); + + ::clFinish(_queue); + + bool bBreak = false; + for (int y = 0; y < HEIGHT / 2 && !bBreak; y++) { + char *pLineUV = (char *)pTempBuffer + y * pitch; + for (int x = 0; x < WIDTH / 2; x++) { + if (*pLineUV != 0x1F) // U + { + bBreak = true; + break; + } + pLineUV++; + if (*pLineUV != 0x2F) // V + { + bBreak = true; + break; + } + pLineUV++; + } + } + delete[] pTempBuffer; + + return !bBreak; +} + +void 
OCLDX11YUY2::CopyOpenCLImage(cl_mem clImageSrc) { + cl_int status = 0; + + // Set appropriate arguments to the kernel2D + + // input buffer image + status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImageSrc); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLImage() failed at " + "clSetKernelArg(kernel_,0,sizeof(cl_mem),&clImageSrc)"); + status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clImage2DOut); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLImage() failed at " + "clSetKernelArg(kernel_,1,sizeof(cl_mem),&clImage2DOut)"); + + // Enqueue a kernel run call. + size_t global_work_offset[] = {0, 0}; + size_t globalThreads[] = {WIDTH, HEIGHT + HEIGHT / 2}; + size_t localThreads[] = {blockSizeX, blockSizeY}; + + // status = + // clEnqueueNDRangeKernel(_queue,kernel_,2,NULL,globalThreads,localThreads,0,NULL,0); + status = clEnqueueNDRangeKernel(_queue, kernel_, 2, NULL, globalThreads, NULL, + 0, NULL, 0); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLImage() failed at clEnqueueNDRangeKernel"); + + status = clFinish(_queue); + CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLImage() failed at clFinish"); +} + +void OCLDX11YUY2::CompileKernel() { + cl_int status = 0; + + size_t kernelSize = sizeof(strKernel); + const char *strs = (const char *)&strKernel[0]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs, + &kernelSize, &status); + + status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (status != CL_SUCCESS) { + if (status == CL_BUILD_PROGRAM_FAILURE) { + cl_int logStatus; + size_t buildLogSize = 0; + logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, buildLogSize, + NULL, &buildLogSize); + std::string buildLog; + buildLog.resize(buildLogSize); + + logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, buildLogSize, + &buildLog[0], NULL); + printf("%s", buildLog.c_str()); + } + return; + } + // get a kernel object 
handle for a kernel with the given name + kernel_ = _wrapper->clCreateKernel(program_, "image2imageCopy", &status); + + size_t kernel2DWorkGroupSize = 0; + status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId], + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), + &kernel2DWorkGroupSize, 0); + + if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize) { + if (blockSizeX > kernel2DWorkGroupSize) { + blockSizeX = kernel2DWorkGroupSize; + blockSizeY = 1; + } + } +} + +bool OCLDX11YUY2::formatSupported() { + UINT supported = 0u; + dxD3D11Device->CheckFormatSupport(dxFormat, (UINT *)&supported); + return supported & D3D11_FORMAT_SUPPORT_TEXTURE2D; +} diff --git a/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h new file mode 100644 index 0000000000..b8797fbeb5 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/dx/OCLDX11YUY2.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_DX11_YUY2_H_
+#define _OCL_DX11_YUY2_H_
+
+#include "OCLDX11Common.h"
+
+// Test case derived from OCLDX11Common. Its helpers read OpenCL images back
+// to the host and compare them against fixed byte patterns, and copy one
+// image into another with an OpenCL kernel.
+class OCLDX11YUY2 : public OCLDX11Common {
+ public:
+  OCLDX11YUY2();
+  virtual ~OCLDX11YUY2();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  // Releases clImage2DOut and the D3D11 texture, then calls the base close().
+  virtual unsigned int close(void);
+
+ protected:
+  // Dimensions used for the read-back regions and the kernel's global range.
+  static const unsigned int WIDTH = 1280;
+  static const unsigned int HEIGHT = 720;
+
+  void testInterop();
+  void AllocateOpenCLImage();
+  // Checks an image of HEIGHT + HEIGHT / 2 rows (Y rows followed by UV rows).
+  bool CheckCLImage(cl_mem clImage);
+  // Checks an image of HEIGHT rows whose bytes must all be 0x7F.
+  bool CheckCLImageY(cl_mem clImage);
+  // Checks an image of HEIGHT / 2 rows of (0x1F, 0x2F) byte pairs.
+  bool CheckCLImageUV(cl_mem clImage);
+  // Runs the copy kernel from clImageSrc into clImage2DOut.
+  void CopyOpenCLImage(cl_mem clImageSrc);
+  // Builds the program and creates the "image2imageCopy" kernel.
+  void CompileKernel();
+  // Queries the D3D11 device for 2D-texture support of dxFormat.
+  bool formatSupported();
+  void testFormat();
+
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  cl_mem clImage2DOut;  // Destination image of the copy kernel (argument 1)
+  DXGI_FORMAT dxFormat;  // Format passed to CheckFormatSupport()
+};
+
+#endif  // _OCL_DX11_YUY2_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp
new file mode 100644
index 0000000000..534d3f541c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/dx/TestList.cpp
@@ -0,0 +1,52 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLTestListImp.h" + +// +// Includes for tests +// +#ifdef ATI_OS_WIN +#include "OCLDX11YUY2.h" +#endif + +// +// Helper macro for adding tests +// +template +static void* dictionary_CreateTestFunc(void) { + return new T(); +} + +#define TEST(name) \ + { #name, &dictionary_CreateTestFunc < name> } + +#ifdef ATI_OS_WIN + +TestEntry TestList[] = {TEST(OCLDX11YUY2)}; + +unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]); +#else +TestEntry TestList[] = {{"void", 0}}; +unsigned int TestListCount = 0; + +#endif +unsigned int TestLibVersion = 0; +const char* TestLibName = "ocldx"; diff --git a/projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude b/projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude new file mode 100644 index 0000000000..39345e8fd7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/dx/ocldx.exclude @@ -0,0 +1 @@ +# all clear diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp new file mode 100644 index 0000000000..fe94e49fd0 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLGLBuffer.h" + +#include +#include +#include +#include + +const static char* strKernel = + "__kernel void glbuffer_test( __global uint4 *source, __global uint4 " + "*glDest, __global uint4 *clDest) \n" + "{ " + " \n" + " int tid = get_global_id(0); " + " \n" + " clDest[ tid ] = source[ tid ] + (uint4)(1); " + " \n" + " glDest[ tid ] = source[ tid ] + (uint4)(2); " + " \n" + "} " + " \n"; + +OCLGLBuffer::OCLGLBuffer() : inGLBuffer_(0), outGLBuffer_(0) { + _numSubTests = 1; +} + +OCLGLBuffer::~OCLGLBuffer() {} + +void OCLGLBuffer::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + // Build the kernel + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_); + + kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), 
"clCreateKernel() failed (%d)", error_); +} + +void OCLGLBuffer::run(void) { + if (_errorFlag) { + return; + } + + cl_mem buffer; + cl_uint4 inData[c_numOfElements] = {{{0}}}; + cl_uint4 outDataCL[c_numOfElements] = {{{0}}}; + cl_uint4 outDataGL[c_numOfElements] = {{{0}}}; + + // Initialize input data with random values + for (unsigned int i = 0; i < c_numOfElements; i++) { + for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) { + inData[i].s[j] = (unsigned int)rand(); + } + } + + // Generate and Bind in & out OpenGL buffers + glGenBuffers(1, &inGLBuffer_); + glGenBuffers(1, &outGLBuffer_); + + glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer_); + glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inData, + GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_); + glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), outDataGL, + GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glFinish(); + + // Create input buffer from GL input buffer + buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_READ_ONLY, + inGLBuffer_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create input GL buffer (%d)", + error_); + buffers_.push_back(buffer); + + // Create output buffer from GL output buffer + buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_WRITE_ONLY, + outGLBuffer_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create output GL buffer (%d)", + error_); + buffers_.push_back(buffer); + + // Create a CL output buffer + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + c_numOfElements * sizeof(cl_uint4), NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)", error_); + buffers_.push_back(buffer); + + // Assign args and execute + for (unsigned int i = 0; i < buffers_.size(); i++) { + error_ = + _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + 
error_); + } + + error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2, + &buffers()[0], 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)", + error_); + + size_t gws[1] = {c_numOfElements}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)", + error_); + + error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2, + &buffers()[0], 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReleaseGLObjects failed (%d)", + error_); + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_); + + // Get the results from both CL and GL buffers + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffers()[2], CL_TRUE, 0, + c_numOfElements * sizeof(cl_uint4), outDataCL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! 
(%d)", + error_); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_); + void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY); + memcpy(outDataGL, glMem, c_numOfElements * sizeof(cl_uint4)); + glUnmapBuffer(GL_ARRAY_BUFFER); + + cl_uint4 expectedCL = {{0}}; + cl_uint4 expectedGL = {{0}}; + + // Check output + for (unsigned int i = 0; i < c_numOfElements; ++i) { + // Calculate expected value in CL output buffer (input + 1) + expectedCL = inData[i]; + expectedCL.s[0]++; + expectedCL.s[1]++; + expectedCL.s[2]++; + expectedCL.s[3]++; + + // Calculate expected value in GL output buffer (input + 2) + expectedGL = inData[i]; + expectedGL.s[0] += 2; + expectedGL.s[1] += 2; + expectedGL.s[2] += 2; + expectedGL.s[3] += 2; + + // Compare expected output with actual data received + for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) { + CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]), + "Element %d in CL output buffer is incorrect!\n\t \ + expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}", + i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2], + expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1], + outDataCL[i].s[2], outDataCL[i].s[3]); + CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]), + "Element %d in GL output buffer is incorrect!\n\t \ + expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}", + i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2], + expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1], + outDataGL[i].s[2], outDataGL[i].s[3]); + } + } +} + +unsigned int OCLGLBuffer::close(void) { + for (unsigned int i = 0; i < buffers().size(); ++i) { + clReleaseMemObject(buffers()[i]); + } + buffers_.clear(); + + // Delete GL in & out buffers + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDeleteBuffers(1, &inGLBuffer_); + inGLBuffer_ = 0; + glDeleteBuffers(1, &outGLBuffer_); + outGLBuffer_ = 0; + + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h 
b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h new file mode 100644 index 0000000000..937acb61b1 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBuffer.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_GL_BUFFER_H_
+#define _OCL_GL_BUFFER_H_
+
+#include "OCLGLCommon.h"
+
+// Test case derived from OCLGLCommon that runs a kernel on CL buffers created
+// from OpenGL buffers and verifies both the CL and the GL output.
+class OCLGLBuffer : public OCLGLCommon {
+ public:
+  OCLGLBuffer();
+  virtual ~OCLGLBuffer();
+
+  // Builds the program and creates the "glbuffer_test" kernel.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  // Creates the buffers, runs the kernel and checks the results.
+  virtual void run(void);
+  // Releases the CL buffers and deletes the GL buffers.
+  virtual unsigned int close(void);
+
+ private:
+  // Number of cl_uint4 elements in each buffer (also the global work size).
+  static const unsigned int c_numOfElements = 1024;
+  GLuint inGLBuffer_;   // GL buffer holding the random input data
+  GLuint outGLBuffer_;  // GL buffer the kernel writes input + 2 into
+};
+
+#endif  // _OCL_GL_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp
new file mode 100644
index 0000000000..dfff6262a1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.cpp
@@ -0,0 +1,303 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLGLBufferMultipleQueues.h" + +#include +#include +#include +#include + +const static char* strKernel = + "__kernel void glbuffer_test( __global uint4 *source, __global uint4 " + "*glDest, __global uint4 *clDest) \n" + "{ " + " \n" + " int tid = get_global_id(0); " + " \n" + " glDest[ tid ] = source[ tid ] + (uint4)(2); " + " \n" + " clDest[ tid ] = source[ tid ] + (uint4)(1); " + " \n" + "} " + " \n"; + +OCLGLBufferMultipleQueues::OCLGLBufferMultipleQueues() { _numSubTests = 1; } + +OCLGLBufferMultipleQueues::~OCLGLBufferMultipleQueues() {} + +void OCLGLBufferMultipleQueues::open(unsigned int test, char* units, + double& conversion, + unsigned int deviceId) { + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + // Create multiple queues for the device (first add already created queue in + // OCLGLCommon::open, then add a second queue) + deviceCmdQueues_.resize(QUEUES_PER_DEVICE_COUNT); + deviceCmdQueues_[0] = cmdQueues_[deviceId]; + for (int queueIndex = 1; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) { + cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue( + context_, devices_[deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed"); + deviceCmdQueues_[queueIndex] = cmdQueue; + } + + // Build the kernel + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_); + + kernel_ = 
_wrapper->clCreateKernel(program_, "glbuffer_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_); +} + +void OCLGLBufferMultipleQueues::run(void) { + if (_errorFlag) { + return; + } + + inputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL); + outputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL); + outputCLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL); + + std::vector > inData( + QUEUES_PER_DEVICE_COUNT); // Input data per queue + + inGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0); + outGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0); + for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) { + // Initialize input data with random values + inData[queueIndex].resize(BUFFER_ELEMENTS_COUNT); + for (int i = 0; i < BUFFER_ELEMENTS_COUNT; i++) { + for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) { + inData[queueIndex][i].s[j] = (unsigned int)rand(); + } + } + + // Generate and Bind in & out OpenGL buffers + glGenBuffers(1, &inGLBufferIDs_[queueIndex]); + glGenBuffers(1, &outGLBufferIDs_[queueIndex]); + + glBindBuffer(GL_ARRAY_BUFFER, inGLBufferIDs_[queueIndex]); + glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), + &inData[queueIndex][0], GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]); + glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), + NULL, GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glFinish(); + + // Create input buffer from GL input buffer + inputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer( + context_, CL_MEM_READ_ONLY, inGLBufferIDs_[queueIndex], &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create input GL buffer (%d)", error_); + + // Create output buffer from GL output buffer + outputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer( + context_, CL_MEM_WRITE_ONLY, outGLBufferIDs_[queueIndex], &error_); + 
CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create output GL buffer (%d)", error_); + + // Create a CL output buffer + outputCLBufferPerQueue_[queueIndex] = _wrapper->clCreateBuffer( + context_, CL_MEM_WRITE_ONLY, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)", + error_); + } + + for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) { + // Assign arguments to kernel according to queue index + error_ = _wrapper->clSetKernelArg( + kernel_, 0, sizeof(cl_mem), + &inputGLBufferPerQueue_[queueIndex]); // Input source + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + error_ = _wrapper->clSetKernelArg( + kernel_, 1, sizeof(cl_mem), + &outputGLBufferPerQueue_[queueIndex]); // Output glDest + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + error_ = _wrapper->clSetKernelArg( + kernel_, 2, sizeof(cl_mem), + &outputCLBufferPerQueue_[queueIndex]); // Output clDest + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + + // Acquire input GL buffer + error_ = _wrapper->clEnqueueAcquireGLObjects( + deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)", + error_); + + // Acquire output GL buffer + error_ = _wrapper->clEnqueueAcquireGLObjects( + deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex], + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)", + error_); + + // Enqueue the kernel + size_t gws[1] = {BUFFER_ELEMENTS_COUNT}; + error_ = + _wrapper->clEnqueueNDRangeKernel(deviceCmdQueues_[queueIndex], kernel_, + 1, NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)", + error_); + + // Release input GL buffer + error_ = _wrapper->clEnqueueReleaseGLObjects( 
+ deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueReleaseGLObjects failed (%d)", error_); + + // Release output GL buffer + error_ = _wrapper->clEnqueueReleaseGLObjects( + deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex], + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueReleaseGLObjects failed (%d)", error_); + + // Flush commands in order to trigger the operations + error_ = _wrapper->clFlush(deviceCmdQueues_[queueIndex]); + CHECK_RESULT((error_ != CL_SUCCESS), "clFlush() failed (%d)", error_); + } + + for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) { + // Get the results from CL buffer (in a synchronous manner) + cl_uint4 outDataCL[BUFFER_ELEMENTS_COUNT]; + error_ = _wrapper->clEnqueueReadBuffer( + deviceCmdQueues_[queueIndex], outputCLBufferPerQueue_[queueIndex], + CL_TRUE, 0, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), outDataCL, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! 
(%d)", + error_); + + cl_uint4 outDataGL[BUFFER_ELEMENTS_COUNT] = {{{0}}}; + glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]); // why again + void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY); + memcpy(outDataGL, glMem, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4)); + glUnmapBuffer(GL_ARRAY_BUFFER); + + cl_uint4 expectedCL = {{0}}; + cl_uint4 expectedGL = {{0}}; + + // Check output + for (int i = 0; i < BUFFER_ELEMENTS_COUNT; ++i) { + // Calculate expected value in CL output buffer (input + 1) + expectedCL = inData[queueIndex][i]; + expectedCL.s[0]++; + expectedCL.s[1]++; + expectedCL.s[2]++; + expectedCL.s[3]++; + + // Calculate expected value in GL output buffer (input + 2) + expectedGL = inData[queueIndex][i]; + expectedGL.s[0] += 2; + expectedGL.s[1] += 2; + expectedGL.s[2] += 2; + expectedGL.s[3] += 2; + + // Compare expected output with actual data received + for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) { + CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]), + "Element %d in CL output buffer is incorrect!\n\t \ + expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}", + i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2], + expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1], + outDataCL[i].s[2], outDataCL[i].s[3]); + CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]), + "Element %d in GL output buffer is incorrect!\n\t \ + expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}", + i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2], + expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1], + outDataGL[i].s[2], outDataGL[i].s[3]); + } + } + } +} + +unsigned int OCLGLBufferMultipleQueues::close(void) { + // Release cl buffers (must be done before releasing the associated GL + // buffers) + for (int bufferIndex = 0; bufferIndex < (int)inputGLBufferPerQueue_.size(); + bufferIndex++) { + error_ = _wrapper->clReleaseMemObject(inputGLBufferPerQueue_[bufferIndex]); + CHECK_RESULT_NO_RETURN((error_ != 
CL_SUCCESS), + "clReleaseMemObject() failed"); + } + + for (int bufferIndex = 0; bufferIndex < (int)outputGLBufferPerQueue_.size(); + bufferIndex++) { + error_ = _wrapper->clReleaseMemObject(outputGLBufferPerQueue_[bufferIndex]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseMemObject() failed"); + } + + for (int bufferIndex = 0; bufferIndex < (int)outputCLBufferPerQueue_.size(); + bufferIndex++) { + error_ = _wrapper->clReleaseMemObject(outputCLBufferPerQueue_[bufferIndex]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseMemObject() failed"); + } + + // Delete GL in & out buffers + glBindBuffer(GL_ARRAY_BUFFER, 0); + if (!inGLBufferIDs_.empty()) { + glDeleteBuffers((int)inGLBufferIDs_.size(), &inGLBufferIDs_[0]); + } + + if (!outGLBufferIDs_.empty()) { + glDeleteBuffers((int)outGLBufferIDs_.size(), &outGLBufferIDs_[0]); + } + + // Release queues created by open method, the first queue per device is + // released by base class + for (int queueIndex = 1; queueIndex < (int)deviceCmdQueues_.size(); + queueIndex++) { + error_ = _wrapper->clReleaseCommandQueue(deviceCmdQueues_[queueIndex]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + deviceCmdQueues_.clear(); + + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h new file mode 100644 index 0000000000..97a65e17ee --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLBufferMultipleQueues.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_ +#define _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_ + +#include "OCLGLCommon.h" + +class OCLGLBufferMultipleQueues : public OCLGLCommon { + public: + OCLGLBufferMultipleQueues(); + virtual ~OCLGLBufferMultipleQueues(); + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual void run(void); + virtual unsigned int close(void); + + private: + static const int BUFFER_ELEMENTS_COUNT = 1024; + static const int QUEUES_PER_DEVICE_COUNT = 2; + std::vector + deviceCmdQueues_; // Multiple queues per device (single device) + std::vector inputGLBufferPerQueue_; // Input GL buffer per queue + std::vector outputGLBufferPerQueue_; // Output GL buffer per queue + std::vector outputCLBufferPerQueue_; // Input CL buffer per queue + std::vector inGLBufferIDs_; // Input GL buffers IDs + std::vector outGLBufferIDs_; // Output GL buffers IDs +}; + +#endif // _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp new file mode 100644 index 0000000000..14a441c80a --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.cpp @@ -0,0 +1,270 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLGLDepthBuffer.h" + +#include +#include +#include +#include + +const static char* strKernel = + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "__kernel void gldepths_test( __global float *output, read_only image2d_t " + "source, sampler_t sampler){ \n" + " int tidX = get_global_id(0);\n" + " int tidY = get_global_id(1);\n" + " float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n" + " output[ tidY * get_image_width( source ) + tidX ] = value.z;\n" + "}\n"; + +OCLGLDepthBuffer::OCLGLDepthBuffer() + : glDepthBuffer_(0), + frameBufferOBJ_(0), + colorBuffer_(0), + clOutputBuffer_(0), + clDepth_(0), + clSampler_(0), + pGLOutput_(0), + pCLOutput_(0), + extensionSupported_(false) { + _numSubTests = 2; + _currentTest = 0; +} + +OCLGLDepthBuffer::~OCLGLDepthBuffer() {} + +void OCLGLDepthBuffer::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + char* pExtensions = (char*)malloc(8192); + size_t returnSize; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 8192, + pExtensions, &returnSize); + + // if extension if not supported + if (!strstr(pExtensions, "cl_khr_gl_depth_images")) { + printf("skipping test depth interop not supported\n"); + free(pExtensions); + return; + } + free(pExtensions); + extensionSupported_ = true; + + _currentTest = test; + + // Build the kernel + program_ = _wrapper->clCreateProgramWithSource(context_, 
1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_); + + kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_); +} + +void OCLGLDepthBuffer::run(void) { + if (_errorFlag || !extensionSupported_) { + return; + } + bool retVal; + switch (_currentTest) { + case 0: + retVal = testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_ATTACHMENT); + break; + case 1: + retVal = testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_ATTACHMENT); + break; + case 2: + retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT); + break; + case 3: + retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT); + break; + default: + CHECK_RESULT(true, "unsupported test number\n"); + } + CHECK_RESULT((retVal != true), "cl-gl depth test failed "); +} + +bool OCLGLDepthBuffer::testDepthRead(GLint internalFormat, + GLenum attachmentType) { + cl_int error; + size_t dimSizes[] = {c_dimSize, c_dimSize}; + + unsigned int bufferSize = c_dimSize * c_dimSize * 4; + bool retVal = false; + + pGLOutput_ = (float*)malloc(bufferSize); + pCLOutput_ = (float*)malloc(bufferSize); + // create Frame buffer object + glGenFramebuffers(1, &frameBufferOBJ_); + + // create textures + glGenTextures(1, &colorBuffer_); + glEnable(GL_TEXTURE_2D); + glBindTexture(GL_TEXTURE_2D, colorBuffer_); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA, + GL_UNSIGNED_BYTE, 0); + glBindTexture(GL_TEXTURE_2D, 0); + // create a renderbuffer 
for the depth/stencil buffer + glGenRenderbuffers(1, &glDepthBuffer_); + glBindRenderbuffer(GL_RENDERBUFFER, glDepthBuffer_); + glRenderbufferStorage(GL_RENDERBUFFER, internalFormat, c_dimSize, c_dimSize); + + // + glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_); + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, attachmentType, GL_RENDERBUFFER, + glDepthBuffer_); + + GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER); + if (GL_FRAMEBUFFER_COMPLETE != status) { + return false; + } + // set up gl state machine + glViewport(0, 0, c_dimSize, c_dimSize); // Reset The Current Viewport + glMatrixMode(GL_PROJECTION); // Select The Projection Matrix + glLoadIdentity(); // Reset The Projection Matrix + gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f); + glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix + glLoadIdentity(); + glEnable(GL_DEPTH_TEST); + // The Type Of Depth Testing To Do + glClear(GL_COLOR_BUFFER_BIT | + GL_DEPTH_BUFFER_BIT); // Clear Screen And Depth Buffer + glBegin(GL_QUADS); // Draw A Quad + glVertex3f(-1.0f, 1.0f, -6.0f); // Top Left + glVertex3f(1.0f, 1.0f, -6.0f); // Top Right + glVertex3f(1.0f, -1.0f, -3.0f); // Bottom Right + glVertex3f(-1.0f, -1.0f, -3.0f); // Bottom Left + glEnd(); + + glFinish(); + + clDepth_ = _wrapper->clCreateFromGLRenderbuffer(context_, CL_MEM_READ_WRITE, + glDepthBuffer_, &error); + if (CL_SUCCESS != error) { + printf("clCreateFromGLRenderbuffer failed\n"); + return false; + } + + clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + bufferSize, NULL, &error); + if (CL_SUCCESS != error) return false; + + clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE, + CL_FILTER_NEAREST, &error); + if (CL_SUCCESS != error) return false; + + error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1, + &clDepth_, 0, NULL, NULL); + + _wrapper->clSetKernelArg(kernel_, 0, 
sizeof(cl_mem), &clOutputBuffer_); + + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_); + + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_); + + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL, + dimSizes, NULL, 0, NULL, NULL); + + _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clDepth_, 0, + NULL, NULL); + + _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_, CL_TRUE, + 0, bufferSize, pCLOutput_, 0, NULL, NULL); + + glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT, + pGLOutput_); + + // test that both resources are identical. + if (0 == memcmp(pGLOutput_, pCLOutput_, bufferSize)) { + retVal = true; // test successful + } else { + printf("expected results is different from actual results\n"); + dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize); + dumpBuffer(pCLOutput_, "CLDepth.csv", c_dimSize); + } + + return retVal; +} + +unsigned int OCLGLDepthBuffer::close(void) { + if (pGLOutput_) { + free(pGLOutput_); + pGLOutput_ = NULL; + } + + if (pCLOutput_) { + free(pCLOutput_); + pCLOutput_ = NULL; + } + + clReleaseMemObject(clDepth_); + clReleaseMemObject(clOutputBuffer_); + clReleaseSampler(clSampler_); + // unbind the texture and frame buffer. 
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0); + glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + // clean gl resources + glDeleteFramebuffers(1, &frameBufferOBJ_); + frameBufferOBJ_ = 0; + glDeleteTextures(1, &colorBuffer_); + colorBuffer_ = 0; + glDeleteTextures(1, &glDepthBuffer_); + glDepthBuffer_ = 0; + + return OCLGLCommon::close(); +} + +// helper functions +unsigned int OCLGLDepthBuffer::formatToSize(GLint internalFormat) { + switch (internalFormat) { + case GL_DEPTH_COMPONENT32F: + return 4; + break; + case GL_DEPTH_COMPONENT16: + return 2; + break; + case GL_DEPTH24_STENCIL8: + return 4; + break; + case GL_DEPTH32F_STENCIL8: + return 8; + break; + default: + return 0; + } +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h new file mode 100644 index 0000000000..b8a3d46ad2 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthBuffer.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_GL_DEPTH_BUFFER_H_ +#define _OCL_GL_DEPTH_BUFFER_H_ + +#include "OCLGLCommon.h" + +class OCLGLDepthBuffer : public OCLGLCommon { + public: + OCLGLDepthBuffer(); + virtual ~OCLGLDepthBuffer(); + static const unsigned int c_dimSize = 128; + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual void run(void); + virtual unsigned int close(void); + + private: + //////////////////// + // test functions // + //////////////////// + bool testDepthRead(GLint internalFormat, GLenum attachmentType); + unsigned int _currentTest; + ///////////////////// + // private members // + ///////////////////// + // GL resource identifiers + GLuint glDepthBuffer_; + GLuint frameBufferOBJ_; + GLuint colorBuffer_; + + // CL identifiers + cl_mem clOutputBuffer_; + cl_mem clDepth_; + cl_sampler clSampler_; + + // pointers to buffers + float* pGLOutput_; + float* pCLOutput_; + bool extensionSupported_; + ////////////////////////////// + // private helper functions // + ////////////////////////////// + // returns element size in bytes. + static unsigned int formatToSize(GLint internalFormat); +}; + +#endif // _OCL_GL_BUFFER_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp new file mode 100644 index 0000000000..38d8099708 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.cpp @@ -0,0 +1,278 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLGLDepthTex.h" + +#include +#include +#include +#include +#include + +const static char* strKernel = + "__kernel void gldepths_test( __global float *output, read_only image2d_t " + "source, sampler_t sampler){ \n" + " int tidX = get_global_id(0);\n" + " int tidY = get_global_id(1);\n" + " float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n" + " output[ tidY * get_image_width( source ) + tidX ] = value.z;\n" + "}\n"; + +OCLGLDepthTex::OCLGLDepthTex() + : glDepthBuffer_(0), + frameBufferOBJ_(0), + colorBuffer_(0), + clOutputBuffer_(0), + clDepth_(0), + clSampler_(0), + pGLOutput_(0), + pCLOutput_(0), + extensionSupported_(false) { + _numSubTests = 8; + _currentTest = 0; +} + +OCLGLDepthTex::~OCLGLDepthTex() {} + +void OCLGLDepthTex::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + char* pExtensions = (char*)malloc(8192); + size_t returnSize; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 8192, + pExtensions, &returnSize); + + // if extension if not supported + if (!strstr(pExtensions, "cl_khr_gl_depth_images")) { + free(pExtensions); + printf("skipping test depth interop not supported\n"); + return; + } + free(pExtensions); + extensionSupported_ = true; + + static const char* OpenCL20Kernel = "-cl-std=CL2.0"; + const char* options = OpenCL20Kernel; + if (test < 4) { + options = NULL; + } + _currentTest = test % 4; + + // Build the kernel + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + 
printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_); + + kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_); +} + +void OCLGLDepthTex::run(void) { + if (_errorFlag || !extensionSupported_) { + return; + } + bool retVal; + switch (_currentTest) { + case 0: + retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, + GL_UNSIGNED_INT_24_8); + break; + case 1: + retVal = + testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_FLOAT); + break; + case 2: + retVal = + testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT); + break; + case 3: + retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, + GL_FLOAT_32_UNSIGNED_INT_24_8_REV); + break; + default: + CHECK_RESULT(true, "unsupported test number\n"); + } + CHECK_RESULT((retVal != true), "cl-gl depth test failed "); +} + +bool OCLGLDepthTex::testDepthRead(GLint internalFormat, GLenum format, + GLenum type) { + const unsigned int bufferSize = c_dimSize * c_dimSize * 4; + + pGLOutput_ = (float*)malloc(bufferSize); + pCLOutput_ = (float*)malloc(bufferSize); + size_t dimSizes[] = {c_dimSize, c_dimSize}; + + bool retVal = false; + // create Frame buffer object + glGenFramebuffers(1, &frameBufferOBJ_); + glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_); + + // create textures + glGenTextures(1, &colorBuffer_); + glBindTexture(GL_TEXTURE_2D, colorBuffer_); + + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA, + GL_UNSIGNED_BYTE, 0); + + glGenTextures(1, &glDepthBuffer_); + glBindTexture(GL_TEXTURE_2D, glDepthBuffer_); + glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, c_dimSize, c_dimSize, 0, + format, type, 0); + GLint glError = glGetError(); + // + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0); + + if (GL_DEPTH_COMPONENT == format) { + 
glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, glDepthBuffer_, + 0); + } else { + glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, + glDepthBuffer_, 0); + } + + glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_); + + GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER); + if (GL_FRAMEBUFFER_COMPLETE != status) { + printf("frame buffer incomplete!\n"); + return false; + } + // set up gl state machine + glViewport(0, 0, c_dimSize, c_dimSize); // Reset The Current Viewport + glMatrixMode(GL_PROJECTION); // Select The Projection Matrix + glLoadIdentity(); // Reset The Projection Matrix + gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f); + glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix + glLoadIdentity(); + glEnable(GL_DEPTH_TEST); + glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_); + + cl_int error; + + clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + bufferSize, NULL, &error); + if (CL_SUCCESS != error) return false; + + clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE, + CL_FILTER_NEAREST, &error); + if (CL_SUCCESS != error) return false; + + clDepth_ = _wrapper->clCreateFromGLTexture( + context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, glDepthBuffer_, &error); + if (CL_SUCCESS != error) return false; + + for (int i = 0; i < 3; ++i) { + // The Type Of Depth Testing To Do + glClear(GL_COLOR_BUFFER_BIT | + GL_DEPTH_BUFFER_BIT); // Clear Screen And Depth Buffer + + const float zValues[3][2] = { + {-6.f, -3.f}, + {-5.f, -2.f}, + {-4.f, -1.f}, + }; + + glBegin(GL_QUADS); // Draw A Quad + glVertex3f(-1.0f, 1.0f, zValues[i][0]); // Top Left + glVertex3f(1.0f, 1.0f, zValues[i][0]); // Top Right + glVertex3f(1.0f, -1.0f, zValues[i][1]); // Bottom Right + glVertex3f(-1.0f, -1.0f, zValues[i][1]); // Bottom Left + glEnd(); + + glFinish(); + + error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1, + &clDepth_, 0, NULL, NULL); + + 
_wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_); + + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_); + + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_); + + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL, + dimSizes, NULL, 0, NULL, NULL); + + _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clDepth_, 0, + NULL, NULL); + + _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_, + CL_TRUE, 0, bufferSize, pCLOutput_, 0, NULL, + NULL); + + glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT, + pGLOutput_); + + // test that both resources are identical. + if (0 == memcmp(pGLOutput_, pCLOutput_, bufferSize)) { + retVal = true; // test successful + } else { + printf("expected results is different from actual results\n"); + dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize); + dumpBuffer(pCLOutput_, "clDepth_.csv", c_dimSize); + } + } + + return retVal; +} + +unsigned int OCLGLDepthTex::close(void) { + if (pGLOutput_) { + free(pGLOutput_); + pGLOutput_ = NULL; + } + + if (pCLOutput_) { + free(pCLOutput_); + pCLOutput_ = NULL; + } + + clReleaseMemObject(clDepth_); + clReleaseMemObject(clOutputBuffer_); + clReleaseSampler(clSampler_); + // unbind the texture and frame buffer. 
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0); + glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + // clean gl resources + glDeleteFramebuffers(1, &frameBufferOBJ_); + frameBufferOBJ_ = 0; + glDeleteTextures(1, &colorBuffer_); + colorBuffer_ = 0; + glDeleteTextures(1, &glDepthBuffer_); + glDepthBuffer_ = 0; + + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h new file mode 100644 index 0000000000..07be55d678 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLDepthTex.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_GL_DEPTH_TEX_H_ +#define _OCL_GL_DEPTH_TEX_H_ + +#include "OCLGLCommon.h" + +class OCLGLDepthTex : public OCLGLCommon { + public: + OCLGLDepthTex(); + virtual ~OCLGLDepthTex(); + static const unsigned int c_dimSize = 128; + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual void run(void); + virtual unsigned int close(void); + + private: + //////////////////// + // test functions // + //////////////////// + bool testDepthRead(GLint internalFormat, GLenum format, GLenum type); + unsigned int _currentTest; + + ///////////////////// + // private members // + ///////////////////// + // GL resource identifiers + GLuint glDepthBuffer_; + GLuint frameBufferOBJ_; + GLuint colorBuffer_; + + // CL identifiers + cl_mem clOutputBuffer_; + cl_mem clDepth_; + cl_sampler clSampler_; + + // pointers to buffers + float* pGLOutput_; + float* pCLOutput_; + bool extensionSupported_; +}; + +#endif // _OCL_GL_BUFFER_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp new file mode 100644 index 0000000000..9d16495e1b --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.cpp @@ -0,0 +1,481 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLGLFenceSync.h" + +#include +#include +#include +#include + +#include "Timer.h" +#ifndef WIN_OS +#include +#endif + +const static char *strKernel = + "__kernel void glmulticontext_test( __global uint4 *source, __global uint4 " + "*dest) \n" + "{ " + " \n" + " int tid = get_global_id(0); " + " \n" + " dest[ tid ] = source [ tid ] + (uint4)(1); " + " \n" + "} " + " \n"; + +OCLGLFenceSync::OCLGLFenceSync() { + memset(contextData_, 0, sizeof(contextData_)); + _numSubTests = 2; +} + +OCLGLFenceSync::~OCLGLFenceSync() {} + +#ifdef WIN_OS +typedef GLsync(__stdcall *glFenceSyncPtr)(GLenum condition, GLbitfield flags); +typedef bool(__stdcall *glIsSyncPtr)(GLsync sync); +typedef void(__stdcall *glDeleteSyncPtr)(GLsync sync); +typedef GLenum(__stdcall *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags, + GLuint64 timeout); +typedef void(__stdcall *glWaitSyncPtr)(GLsync sync, GLbitfield flags, + GLuint64 timeout); +typedef void(__stdcall *glGetInteger64vPtr)(GLenum pname, GLint64 *params); +typedef void(__stdcall *glGetSyncivPtr)(GLsync sync, GLenum pname, + GLsizei bufSize, GLsizei *length, + GLint *values); +#else +typedef GLsync (*glFenceSyncPtr)(GLenum condition, GLbitfield flags); +typedef bool (*glIsSyncPtr)(GLsync sync); +typedef void (*glDeleteSyncPtr)(GLsync sync); +typedef GLenum (*glClientWaitSyncPtr)(GLsync sync, GLbitfield flags, + GLuint64 timeout); +typedef void (*glWaitSyncPtr)(GLsync sync, GLbitfield flags, GLuint64 timeout); +typedef void (*glGetInteger64vPtr)(GLenum 
pname, GLint64 *params); +typedef void (*glGetSyncivPtr)(GLsync sync, GLenum pname, GLsizei bufSize, + GLsizei *length, GLint *values); +#endif + +typedef struct __GLsync *GLsync; + +glFenceSyncPtr glFenceSyncFunc; + +glIsSyncPtr glIsSyncFunc; + +glDeleteSyncPtr glDeleteSyncFunc; + +glClientWaitSyncPtr glClientWaitSyncFunc; + +glWaitSyncPtr glWaitSyncFunc; + +glGetInteger64vPtr glGetInteger64vFunc; + +glGetSyncivPtr glGetSyncivFunc; + +#define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError())) + +#define cl_khr_gl_event 1 + +static void InitSyncFns() { +#ifdef WIN_OS + glFenceSyncFunc = (glFenceSyncPtr)wglGetProcAddress("glFenceSync"); + glIsSyncFunc = (glIsSyncPtr)wglGetProcAddress("glIsSync"); + glDeleteSyncFunc = (glDeleteSyncPtr)wglGetProcAddress("glDeleteSync"); + glClientWaitSyncFunc = + (glClientWaitSyncPtr)wglGetProcAddress("glClientWaitSync"); + glWaitSyncFunc = (glWaitSyncPtr)wglGetProcAddress("glWaitSync"); + glGetInteger64vFunc = + (glGetInteger64vPtr)wglGetProcAddress("glGetInteger64v"); + glGetSyncivFunc = (glGetSyncivPtr)wglGetProcAddress("glGetSynciv"); +#else + glFenceSyncFunc = (glFenceSyncPtr)glXGetProcAddress((GLubyte *)"glFenceSync"); + glIsSyncFunc = (glIsSyncPtr)glXGetProcAddress((GLubyte *)"glIsSync"); + glDeleteSyncFunc = + (glDeleteSyncPtr)glXGetProcAddress((GLubyte *)"glDeleteSync"); + glClientWaitSyncFunc = + (glClientWaitSyncPtr)glXGetProcAddress((GLubyte *)"glClientWaitSync"); + glWaitSyncFunc = (glWaitSyncPtr)glXGetProcAddress((GLubyte *)"glWaitSync"); + glGetInteger64vFunc = + (glGetInteger64vPtr)glXGetProcAddress((GLubyte *)"glGetInteger64v"); + glGetSyncivFunc = (glGetSyncivPtr)glXGetProcAddress((GLubyte *)"glGetSynciv"); +#endif +} + +#define USING_ARB_sync 1 + +typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)( + cl_context context, GLsync sync, cl_int *errCode_ret); + +clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr; + +/* Helper to determine if an extension is supported by a device */ +int 
is_extension_available(cl_device_id device, const char *extensionName) { + char *extString; + size_t size = 0; + int err; + int result = -1; + + if ((err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &size))) { + printf( + "Error: failed to determine size of device extensions string (err = " + "%d)\n", + err); + return -2; + } + + if (0 == size) return -3; + + extString = (char *)malloc(size); + if (NULL == extString) { + printf( + "Error: unable to allocate %ld byte buffer for extension string (err = " + "%d)\n", + (long)size, err); + return -40; + } + + if ((err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, extString, + NULL))) { + printf("Error: failed to obtain device extensions string (err = %d)\n", + err); + free(extString); + return -5; + } + + if (strstr(extString, extensionName)) result = 0; + + free(extString); + return result; +} + +void OCLGLFenceSync::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + _openTest = test; + + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + cl_context_properties properties[7] = {0}; + for (unsigned int i = 0; i < c_glContextCount; i++) { + error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event"); + if (error_ != CL_SUCCESS) { + printf("Silent failure: cl_khr_gl_event extension not available (%d)\n", + error_); + extensionSupported_ = false; + return; + } + extensionSupported_ = true; + + createGLContext(contextData_[i].glContext); + getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties); + + // Create new CL context from GL context + contextData_[i].clContext = _wrapper->clCreateContext( + properties, 1, &devices_[_deviceId], NULL, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", + error_); + + // Create command queue for new context + contextData_[i].clCmdQueue = 
_wrapper->clCreateCommandQueue( + contextData_[i].clContext, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)", + error_); + + // Build the kernel + contextData_[i].clProgram = _wrapper->clCreateProgramWithSource( + contextData_[i].clContext, 1, &strKernel, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1, + &devices_[deviceId], NULL, NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram, + devices_[deviceId], CL_PROGRAM_BUILD_LOG, + 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", + error_); + + contextData_[i].clKernel = _wrapper->clCreateKernel( + contextData_[i].clProgram, "glmulticontext_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", + error_); + } +} + +void OCLGLFenceSync::run() { + if (_errorFlag || !extensionSupported_) { + return; + } + + CPerfCounter timer; + double sec; + float perf; + cl_uint4 inOutData[c_numOfElements] = {{{0}}}; + cl_uint4 expectedData[c_numOfElements] = {{{0}}}; + unsigned int m = sizeof(cl_uint4) / sizeof(cl_uint); + int count = 0; + // Initialize input data with random values + for (unsigned int i = 0; i < c_numOfElements; i++) { + for (unsigned int j = 0; j < m; j++) { + inOutData[i].s[j] = (unsigned int)i; + expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount; + } + } + + cl_event fenceEvent0 = NULL, fenceEvent = NULL; + GLsync glFence0 = NULL, glFence = NULL; + InitSyncFns(); + + clCreateEventFromGLsyncKHR_ptr = + (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddress( + "clCreateEventFromGLsyncKHR"); + if (clCreateEventFromGLsyncKHR_ptr == NULL) { + printf( + "ERROR: Unable to run fence_sync test 
(clCreateEventFromGLsyncKHR " + "function not discovered!)\n"); + return; + } + + for (unsigned int i = 0; i < c_glContextCount; i++) { + makeCurrent(contextData_[i].glContext); + + // Generate and Bind in & out OpenGL buffers + GLuint inGLBuffer = 0, outGLBuffer = 0; + glGenBuffers(1, &inGLBuffer); + glGenBuffers(1, &outGLBuffer); + + glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer); + glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData, + GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer); + glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL, + GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + + glFinish(); + + // Checking if clWaitForEvents works + switch (_openTest) { + case 0: // Using fence sync + glFence0 = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + CHECK_RESULT((glFence0 == NULL), "Unable to create GL fence"); + + fenceEvent0 = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext, + glFence0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create CL event from GL fence (%d)", error_); + + error_ = clWaitForEvents(1, &fenceEvent0); + CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)", + error_); + break; + default: + glFinish(); + break; + } + + if (fenceEvent != NULL) { + clReleaseEvent(fenceEvent0); + glDeleteSync(glFence0); + } + + cl_event acqEvent1 = 0, acqEvent2 = 0, kernelEvent = 0, relEvent1 = 0, + relEvent2 = 0; + + // Create input buffer from GL input buffer + contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer( + contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create input GL buffer (%d)", error_); + + // Create output buffer from GL output buffer + contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer( + contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create output GL buffer (%d)", error_); 
+ + timer.Reset(); + switch (_openTest) { + case 0: // Using fence sync + timer.Start(); + glFence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + timer.Stop(); + CHECK_RESULT((glFence == NULL), "Unable to create GL fence"); + + timer.Start(); + fenceEvent = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext, + glFence, &error_); + timer.Stop(); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create CL event from GL fence (%d)", error_); + break; + default: + break; + } + + error_ = + _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem), + &(contextData_[i].inputBuffer)); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + + error_ = + _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem), + &(contextData_[i].outputBuffer)); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + + switch (_openTest) { + case 0: // Using fence sync + timer.Start(); + error_ = _wrapper->clEnqueueAcquireGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 1, + &fenceEvent, &acqEvent1); + timer.Stop(); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to acquire GL objects (%d)", error_); + + timer.Start(); + error_ = _wrapper->clEnqueueAcquireGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1, + &fenceEvent, &acqEvent2); + timer.Stop(); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to acquire GL objects (%d)", error_); + break; + case 1: // Using glFinish + timer.Start(); + glFinish(); + timer.Stop(); + + timer.Start(); + error_ = _wrapper->clEnqueueAcquireGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 0, + NULL, &acqEvent1); + timer.Stop(); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to acquire GL objects (%d)", error_); + + timer.Start(); + error_ = _wrapper->clEnqueueAcquireGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, + NULL, &acqEvent2); + timer.Stop(); + 
CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to acquire GL objects (%d)", error_); + break; + default: + break; + } + + size_t gws[1] = {c_numOfElements}; + cl_event evts[2] = {acqEvent1, acqEvent2}; + error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue, + contextData_[i].clKernel, 1, NULL, + gws, NULL, 2, evts, &kernelEvent); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)", + error_); + + error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1, + &(contextData_[i].inputBuffer), + 1, &kernelEvent, &relEvent1); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueReleaseGLObjects failed (%d)", error_); + + error_ = _wrapper->clEnqueueReleaseGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1, + &kernelEvent, &relEvent2); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueReleaseGLObjects failed (%d)", error_); + + evts[0] = relEvent1; + evts[1] = relEvent2; + error_ = clWaitForEvents(2, evts); + CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)", + error_); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer); + void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY); + memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4)); + glUnmapBuffer(GL_ARRAY_BUFFER); + + _wrapper->clReleaseMemObject(contextData_[i].inputBuffer); + _wrapper->clReleaseMemObject(contextData_[i].outputBuffer); + + // Delete GL buffers + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDeleteBuffers(1, &inGLBuffer); + inGLBuffer = 0; + glDeleteBuffers(1, &outGLBuffer); + outGLBuffer = 0; + } + + sec = timer.GetElapsedTime(); + perf = (float)sec * 1000000; // in microseconds + _perfInfo = (float)perf; + + if (fenceEvent != NULL) { + clReleaseEvent(fenceEvent); + glDeleteSync(glFence); + } + + // Compare expected output with actual data received + for (unsigned int i = 0; i < c_numOfElements; i++) { + for (unsigned int j = 0; j < m; j++) { + if (inOutData[i].s[j] != expectedData[i].s[j]) { + printf( 
+ "Element %u is incorrect!\t expected:[ %u, %u, %u, %u ] differs " + "from actual:{%u, %u, %u, %u}\n", + i, expectedData[i].s[0], expectedData[i].s[1], expectedData[i].s[2], + expectedData[i].s[3], inOutData[i].s[0], inOutData[i].s[1], + inOutData[i].s[2], inOutData[i].s[3]); + + count++; + } + } + } + if (count) printf("Number of elements wrong: %d\n", count); +} + +unsigned int OCLGLFenceSync::close() { + error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event"); + if (error_ == CL_SUCCESS) { + for (unsigned int i = 0; i < c_glContextCount; i++) { + makeCurrent(contextData_[i].glContext); + _wrapper->clReleaseKernel(contextData_[i].clKernel); + _wrapper->clReleaseProgram(contextData_[i].clProgram); + _wrapper->clReleaseCommandQueue(contextData_[i].clCmdQueue); + _wrapper->clReleaseContext(contextData_[i].clContext); + destroyGLContext(contextData_[i].glContext); + } + } + + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h new file mode 100644 index 0000000000..af168485cc --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLFenceSync.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE. */
+
+#ifndef _OCL_GL_FENCE_SYNC_H_
+#define _OCL_GL_FENCE_SYNC_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL interop test built around clCreateEventFromGLsyncKHR: run() either
+// waits on a CL event created from a GL fence or falls back to glFinish()
+// before acquiring the shared GL buffers.
+class OCLGLFenceSync : public OCLGLCommon {
+ public:
+  OCLGLFenceSync();
+  virtual ~OCLGLFenceSync();
+
+  // Test life-cycle entry points; same signatures as the other OCLGLCommon
+  // based tests in this module.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Number of entries in contextData_ (one GL context + CL context pair each).
+  static const unsigned int c_glContextCount = 1;
+  // Number of cl_uint4 elements in the shared GL buffers; also used as the
+  // 1-D global work size in run().
+  static const unsigned int c_numOfElements = 8192;
+
+  // Everything that is created per GL context and released in close().
+  struct GLContextDataSet {
+    OCLGLHandle glContext;
+    cl_context clContext;
+    cl_command_queue clCmdQueue;
+    cl_program clProgram;
+    cl_kernel clKernel;
+    cl_mem inputBuffer;   // CL view of the GL input buffer (created in run())
+    cl_mem outputBuffer;  // CL view of the GL output buffer (created in run())
+  };
+  GLContextDataSet contextData_[c_glContextCount];
+
+  // NOTE(review): neither flag is referenced in the visible part of the .cpp;
+  // extensionSupported_ presumably caches the cl_khr_gl_event query - confirm.
+  bool failed_;
+  bool extensionSupported_;
+};
+
+#endif  // _OCL_GL_FENCE_SYNC_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp
new file mode 100644
index 0000000000..c2ba6a10f4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.cpp
@@ -0,0 +1,298 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLGLMsaaTexture.h" + +#include +#include +#include +#include + +const static char* strKernel = + "__kernel void gl_msaa_test( __global uint4 *output, read_only " + "image2d_msaa_t source, unsigned int numSamples){ \n" + " int tidX = get_global_id(0);\n" + " int tidY = get_global_id(1);\n" + " for (int i = 0 ; i < numSamples ; i++) {\n" + " uint4 value = read_imageui( source, (int2)( tidX, tidY ) ,i);\n" + " int index = (tidY * get_image_width( source ) + tidX)*numSamples + " + "i;\n" + " output[ index ] = value;\n" + " }\n" + "}\n"; + +const static char* glDownSampleShader = + "uniform sampler2DMS MsaaTex;\n" + "uniform int numSamples;\n" + "uniform ivec2 resolution;\n" + "\n" + "varying vec4 gl_TexCoord[ ]; \n" + "\n" + "void main(void)\n" + "{\n" + " vec4 accum = vec4(0.0,0.0,0.0,0.0);\n" + " ivec2 coord = ivec2(resolution * gl_TexCoord[0].xy) ;\n" + " for ( int i = 0 ; i < numSamples ; i++)\n" + " {\n" + " accum += texelFetch(MsaaTex,coord,i);\n" + " }\n" + " accum /= numSamples;\n" + " \n" + " \n" + " \n" + " gl_FragColor = accum;\n" + "}"; + +OCLGLMsaaTexture::OCLGLMsaaTexture() + : msaaDepthBuffer_(0), + msaaFrameBufferOBJ_(0), + msaaColorBuffer_(0), + glShader_(0), + glprogram_(0), + clOutputBuffer_(0), + clMsaa_(0), + pGLOutput_(0), + pCLOutput_(0) { + _numSubTests = 1; + _currentTest = 0; +} + +OCLGLMsaaTexture::~OCLGLMsaaTexture() {} + +void OCLGLMsaaTexture::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + _currentTest = test; + + // Build the kernel + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + 
_wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_); + + kernel_ = _wrapper->clCreateKernel(program_, "gl_msaa_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_); +} + +void OCLGLMsaaTexture::run(void) { + if (_errorFlag) { + return; + } + bool retVal; + switch (_currentTest) { + case 0: + retVal = testMsaaRead(GL_RGBA, 2); + break; + default: + CHECK_RESULT(true, "unsupported test number\n"); + } + CHECK_RESULT((retVal != true), "cl-gl depth test failed "); +} + +unsigned int OCLGLMsaaTexture::close(void) { + if (pGLOutput_) { + free(pGLOutput_); + pGLOutput_ = NULL; + } + + if (pCLOutput_) { + free(pCLOutput_); + pCLOutput_ = NULL; + } + + clReleaseMemObject(clMsaa_); + clReleaseMemObject(clOutputBuffer_); + + glFinish(); + // unbind the texture and frame buffer. 
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0); + glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0); + + // clean gl resources + glDeleteFramebuffers(1, &msaaFrameBufferOBJ_); + msaaFrameBufferOBJ_ = 0; + glDeleteTextures(1, &msaaColorBuffer_); + msaaColorBuffer_ = 0; + glDeleteTextures(1, &msaaDepthBuffer_); + msaaDepthBuffer_ = 0; + + glDeleteProgram(glprogram_); + glDeleteShader(glShader_); + + return OCLGLCommon::close(); +} + +bool OCLGLMsaaTexture::testMsaaRead(GLint internalFormat, + unsigned int numSamples) { + size_t dimSizes[] = {c_dimSize, c_dimSize}; + + unsigned int bufferSize = c_dimSize * c_dimSize * 4; + bool retVal = false; + createGLFragmentProgramFromSource(glDownSampleShader, glShader_, glprogram_); + + ///////////////////// + // create msaa FBO // + ///////////////////// + glGenFramebuffers(1, &msaaFrameBufferOBJ_); + glBindFramebuffer(GL_FRAMEBUFFER, msaaFrameBufferOBJ_); + + // create textures + glGenTextures(1, &msaaColorBuffer_); + glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_); + glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples, GL_RGBA8, + c_dimSize, c_dimSize, GL_TRUE); + + glGenTextures(1, &msaaDepthBuffer_); + glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaDepthBuffer_); + glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples, + GL_DEPTH_COMPONENT24, c_dimSize, c_dimSize, GL_TRUE); + + // + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, msaaColorBuffer_, + 0); + glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, msaaDepthBuffer_, + 0); + + // verify all resource allocations are well. 
+ GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER); + if (GL_FRAMEBUFFER_COMPLETE != status) { + return false; + } + // set up gl state machine + glViewport(0, 0, c_dimSize, c_dimSize); // Reset The Current Viewport + glMatrixMode(GL_PROJECTION); // Select The Projection Matrix + glLoadIdentity(); // Reset The Projection Matrix + gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f); + glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix + glLoadIdentity(); + glEnable(GL_DEPTH_TEST); + // The Type Of Depth Testing To Do + glClear(GL_COLOR_BUFFER_BIT | + GL_DEPTH_BUFFER_BIT); // Clear Screen And Depth Buffer + glBegin(GL_QUADS); // Draw A Quad + glVertex3f(-1.0f, 1.0f, -6.0f); // Top Left + glVertex3f(1.0f, 1.0f, -6.0f); // Top Right + glVertex3f(1.0f, -1.0f, -3.0f); // Bottom Right + glVertex3f(-1.0f, -1.0f, -3.0f); // Bottom Left + glEnd(); + + glFinish(); + cl_int error; + clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + bufferSize, NULL, &error); + if (CL_SUCCESS != error) return false; + + clMsaa_ = _wrapper->clCreateFromGLTexture(context_, CL_MEM_READ_WRITE, + GL_TEXTURE_2D_MULTISAMPLE, 0, + msaaColorBuffer_, &error); + if (CL_SUCCESS != error) return false; + + GLsizei samples; + error = _wrapper->clGetGLTextureInfo(clMsaa_, CL_GL_NUM_SAMPLES, + sizeof(samples), &samples, NULL); + + error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1, + &clMsaa_, 0, NULL, NULL); + if (CL_SUCCESS != error) return false; + + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_); + + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clMsaa_); + + _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), &numSamples); + + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL, + dimSizes, NULL, 0, NULL, NULL); + + _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clMsaa_, 0, + NULL, NULL); + + pGLOutput_ = (unsigned int*)malloc(bufferSize); + 
pCLOutput_ = (unsigned int*)malloc(bufferSize); + + _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_, CL_TRUE, + 0, bufferSize, pCLOutput_, 0, NULL, NULL); + + // down sample + glBindFramebuffer(GL_FRAMEBUFFER, 0); + glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_); + glUseProgram(glprogram_); + + glUniform1i(glGetUniformLocation(glprogram_, "numSamples"), numSamples); + glUniform2i(glGetUniformLocation(glprogram_, "resolution"), c_dimSize, + c_dimSize); + glUniform1i(glGetUniformLocation(glprogram_, "MsaaTex"), 0); + + // printOpenGLError(); + + glBegin(GL_QUADS); + glVertex2f(-1.0f, 1.0f); + glTexCoord2f(1.0f, 0.0f); + glVertex2f(1.0f, 1.0f); + glTexCoord2f(1.0f, 1.0f); + glVertex2f(1.0f, -1.0f); + glTexCoord2f(0.0f, 1.0f); + glVertex2f(-1.0f, -1.0f); + glTexCoord2f(0.0f, 0.0f); + glEnd(); + + glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0); + glUseProgram(0); + + glReadPixels(0, 0, c_dimSize, c_dimSize, GL_BGRA, GL_UNSIGNED_BYTE, + pGLOutput_); + + if (absDiff(pGLOutput_, pCLOutput_, c_dimSize)) retVal = true; + + return retVal; +} + +bool OCLGLMsaaTexture::absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer, + const unsigned int c_dimSize) { + bool retVal = true; + for (unsigned int i = 0; i < c_dimSize * c_dimSize; i++) { + char clPixel[4]; + char glPixel[4]; + char diff[4] = {0}; + memcpy(clPixel, &(pCLBuffer[i]), sizeof(clPixel)); + memcpy(glPixel, &(pGLBuffer[i]), sizeof(glPixel)); + + for (int j = 0; j < 4; j++) { + diff[j] = abs(clPixel[j] - glPixel[i]); + if (diff[j] > 10) retVal = false; + } + } + return retVal; +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h new file mode 100644 index 0000000000..f3c1ab6296 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMsaaTexture.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE. */
+
+#ifndef _OCL_GL_MSAA_TEXTURE_H_
+#define _OCL_GL_MSAA_TEXTURE_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL interop test that reads a multisampled GL texture from an OpenCL
+// kernel and compares the result with a GL-side down-sample of the same
+// texture.
+class OCLGLMsaaTexture : public OCLGLCommon {
+ public:
+  OCLGLMsaaTexture();
+  virtual ~OCLGLMsaaTexture();
+  // Width and height, in pixels, of the square MSAA render target.
+  static const unsigned int c_dimSize = 128;
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  ////////////////////
+  // test functions //
+  ////////////////////
+  // Renders into the MSAA FBO, reads it back through CL and GL, and returns
+  // true when the two results agree.
+  bool testMsaaRead(GLint internalFormat, unsigned int NumSamples);
+  // Sub-test index passed to open(); selects the case executed by run().
+  unsigned int _currentTest;
+
+  //////////////////////////////
+  // private helper functions //
+  //////////////////////////////
+
+  // Compares two dimSize x dimSize images of 32-bit pixels channel by
+  // channel; intended to return true when no channel differs by more than
+  // the fixed tolerance (10) used in the .cpp.
+  static bool absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
+                      const unsigned int dimSize);
+
+  /////////////////////
+  // private members //
+  /////////////////////
+  // GL resource identifiers
+  GLuint msaaDepthBuffer_;
+  GLuint msaaFrameBufferOBJ_;
+  GLuint msaaColorBuffer_;
+  GLuint glShader_;
+  GLuint glprogram_;
+  // CL identifiers
+  cl_mem clOutputBuffer_;  // kernel output buffer
+  cl_mem clMsaa_;          // CL image created from msaaColorBuffer_
+
+  // Host-side copies of the GL and CL results; allocated in testMsaaRead()
+  // and freed in close().
+  unsigned int* pGLOutput_;
+  unsigned int* pCLOutput_;
+};
+
+#endif  // _OCL_GL_MSAA_TEXTURE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp
new file mode 100644
index 0000000000..f46640741e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.cpp
@@ -0,0 +1,231 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
*/ + +#include "OCLGLMultiContext.h" + +#include +#include +#include +#include + +const static char* strKernel = + "__kernel void glmulticontext_test( __global uint4 *source, __global uint4 " + "*dest) \n" + "{ " + " \n" + " int tid = get_global_id(0); " + " \n" + " dest[ tid ] = source[ tid ] + (uint4)(1); " + " \n" + "} " + " \n"; + +OCLGLMultiContext::OCLGLMultiContext() { + memset(contextData_, 0, sizeof(contextData_)); + _numSubTests = 1; +} + +OCLGLMultiContext::~OCLGLMultiContext() {} + +void OCLGLMultiContext::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + cl_context_properties properties[7] = {0}; + for (unsigned int i = 0; i < c_glContextCount; i++) { + createGLContext(contextData_[i].glContext); + getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties); + + // Create new CL context from GL context + contextData_[i].clContext = _wrapper->clCreateContext( + properties, 1, &devices_[_deviceId], NULL, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", + error_); + + // Create command queue for new context + contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue( + contextData_[i].clContext, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)", + error_); + + // Build the kernel + contextData_[i].clProgram = _wrapper->clCreateProgramWithSource( + contextData_[i].clContext, 1, &strKernel, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1, + &devices_[deviceId], NULL, NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram, + devices_[deviceId], 
CL_PROGRAM_BUILD_LOG, + 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", + error_); + + contextData_[i].clKernel = _wrapper->clCreateKernel( + contextData_[i].clProgram, "glmulticontext_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", + error_); + } +} + +void OCLGLMultiContext::run() { + if (_errorFlag) { + return; + } + + cl_uint4 inOutData[c_numOfElements] = {{{0}}}; + cl_uint4 expectedData[c_numOfElements] = {{{0}}}; + + // Initialize input data with random values + for (unsigned int i = 0; i < c_numOfElements; i++) { + for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) { + inOutData[i].s[j] = (unsigned int)rand(); + expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount; + } + } + + for (unsigned int i = 0; i < c_glContextCount; i++) { + makeCurrent(contextData_[i].glContext); + + // Generate and Bind in & out OpenGL buffers + GLuint inGLBuffer = 0, outGLBuffer = 0; + glGenBuffers(1, &inGLBuffer); + glGenBuffers(1, &outGLBuffer); + + glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer); + glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData, + GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer); + glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL, + GL_STATIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glFinish(); + + // Create input buffer from GL input buffer + contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer( + contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create input GL buffer (%d)", error_); + + // Create output buffer from GL output buffer + contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer( + contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "Unable to create output GL buffer (%d)", error_); + 
+ error_ = + _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem), + &(contextData_[i].inputBuffer)); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + + error_ = + _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem), + &(contextData_[i].outputBuffer)); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)", + error_); + + error_ = _wrapper->clEnqueueAcquireGLObjects(contextData_[i].clCmdQueue, 1, + &(contextData_[i].inputBuffer), + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)", + error_); + + error_ = _wrapper->clEnqueueAcquireGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)", + error_); + + size_t gws[1] = {c_numOfElements}; + error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue, + contextData_[i].clKernel, 1, NULL, + gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)", + error_); + + error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1, + &(contextData_[i].inputBuffer), + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueReleaseGLObjects failed (%d)", error_); + + error_ = _wrapper->clEnqueueReleaseGLObjects( + contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueReleaseGLObjects failed (%d)", error_); + + error_ = _wrapper->clFinish(contextData_[i].clCmdQueue); + CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer); + void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY); + memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4)); + glUnmapBuffer(GL_ARRAY_BUFFER); + + _wrapper->clReleaseMemObject(contextData_[i].inputBuffer); + 
_wrapper->clReleaseMemObject(contextData_[i].outputBuffer); + + // Delete GL buffers + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDeleteBuffers(1, &inGLBuffer); + inGLBuffer = 0; + glDeleteBuffers(1, &outGLBuffer); + outGLBuffer = 0; + } + + // Compare expected output with actual data received + for (unsigned int i = 0; i < c_numOfElements; i++) { + for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) { + CHECK_RESULT((inOutData[i].s[j] != expectedData[i].s[j]), + "Element %d is incorrect!\n\t \ + expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}", + i, expectedData[i].s[0], expectedData[i].s[1], + expectedData[i].s[2], expectedData[i].s[3], + inOutData[i].s[0], inOutData[i].s[1], inOutData[i].s[2], + inOutData[i].s[3]); + } + } +} + +unsigned int OCLGLMultiContext::close() { + for (unsigned int i = 0; i < c_glContextCount; i++) { + makeCurrent(contextData_[i].glContext); + _wrapper->clReleaseKernel(contextData_[i].clKernel); + _wrapper->clReleaseProgram(contextData_[i].clProgram); + _wrapper->clReleaseCommandQueue(contextData_[i].clCmdQueue); + _wrapper->clReleaseContext(contextData_[i].clContext); + destroyGLContext(contextData_[i].glContext); + } + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h new file mode 100644 index 0000000000..14983339f3 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLMultiContext.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_GL_MULTI_CONTEXT_H_
+#define _OCL_GL_MULTI_CONTEXT_H_
+
+#include "OCLGLCommon.h"
+
+// CL/GL interop test that creates several GL contexts, each with its own
+// sharing CL context, and chains a +1 kernel through all of them.
+class OCLGLMultiContext : public OCLGLCommon {
+ public:
+  OCLGLMultiContext();
+  virtual ~OCLGLMultiContext();
+
+  // Test life-cycle entry points; same signatures as the other OCLGLCommon
+  // based tests in this module.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Number of GL/CL context pairs; also the total increment run() expects on
+  // every element after all passes.
+  static const unsigned int c_glContextCount = 3;
+  // Number of cl_uint4 elements in the shared buffers and the 1-D global
+  // work size.
+  static const unsigned int c_numOfElements = 128;
+
+  // Everything that is created per GL context in open()/run() and released
+  // in run()/close().
+  struct GLContextDataSet {
+    OCLGLHandle glContext;
+    cl_context clContext;
+    cl_command_queue clCmdQueue;
+    cl_program clProgram;
+    cl_kernel clKernel;
+    cl_mem inputBuffer;   // CL view of the GL input buffer
+    cl_mem outputBuffer;  // CL view of the GL output buffer
+  };
+  GLContextDataSet contextData_[c_glContextCount];
+
+  // NOTE(review): not referenced (or initialized) anywhere in the .cpp -
+  // confirm whether it is still needed.
+  bool failed_;
+};
+
+#endif  // _OCL_GL_MULTI_CONTEXT_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp
new file mode 100644
index 0000000000..8b5a658893
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.cpp
@@ -0,0 +1,144 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLGLTexture.h" + +#include +#include +#include +#include + +const static char* strKernelui = + "__kernel void gltexture_test(read_only image2d_t source, write_only " + "image2d_t dest) \n" + "{ " + " \n" + " int tidX = get_global_id(0); " + " \n" + " int tidY = get_global_id(1); " + " \n" + " uint4 pixel = read_imageui(source, (int2)(tidX, tidY)); " + " \n" + " write_imageui(dest, (int2)(tidX, tidY), pixel); " + " \n" + "}"; + +const static char* strKernelf = + "__kernel void gltexture_test(read_only image2d_t source, write_only " + "image2d_t dest) \n" + "{ " + " \n" + " int tidX = get_global_id(0); " + " \n" + " int tidY = get_global_id(1); " + " \n" + " float4 pixel = read_imagef(source, (int2)(tidX, tidY)); " + " \n" + " write_imagef(dest, (int2)(tidX, tidY), pixel); " + " \n" + "} " + " \n"; + +OCLGLTexture::OCLGLTexture() + : inDataGL_(NULL), outDataGL_(NULL), inGLTexture_(0), outGLTexture_(0) { + _numSubTests = 4 * 2; +} + +OCLGLTexture::~OCLGLTexture() {} + +void OCLGLTexture::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLGLCommon::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + currentTest_ = test % 4; + testRender_ = ((test / 4) >= 1) ? 
true : false; + + // Build the kernel + if (0 == currentTest_) { + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernelui, + NULL, &error_); + + } else { + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernelf, + NULL, &error_); + } + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateProgramWithSource() failed (%d)", error_); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_); + + kernel_ = _wrapper->clCreateKernel(program_, "gltexture_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_); +} + +void OCLGLTexture::run(void) { + bool retVal = false; + switch (currentTest_) { + case 0: + retVal = runTextureTest(GL_RGBA32UI, GL_RGBA_INTEGER, + GL_UNSIGNED_INT); + break; + case 1: + retVal = + runTextureTest(GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE); + break; + case 2: + retVal = runTextureTest(GL_RGBA16, GL_RGBA, GL_SHORT); + break; + case 3: + retVal = runTextureTest(GL_RGBA32F, GL_RGBA, GL_FLOAT); + break; + default: + CHECK_RESULT(true, "unsupported test number\n"); + } + CHECK_RESULT((retVal != true), "cl-gl texture interop test failed "); +} + +unsigned int OCLGLTexture::close(void) { + clReleaseMemObject(buffers_[0]); + clReleaseMemObject(buffers_[1]); + buffers_.clear(); + // Delete GL in & out buffers + glFinish(); + glBindTexture(GL_TEXTURE_2D, 0); + glDeleteTextures(1, &inGLTexture_); + inGLTexture_ = 0; + glDeleteTextures(1, &outGLTexture_); + outGLTexture_ = 0; + + free(inDataGL_); + inDataGL_ = NULL; + free(outDataGL_); + outDataGL_ = NULL; + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h 
b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h new file mode 100644 index 0000000000..412eddbb37 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/OCLGLTexture.h @@ -0,0 +1,214 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_GL_TEXTURE_H_ +#define _OCL_GL_TEXTURE_H_ + +#include + +#include "OCLGLCommon.h" + +class OCLGLTexture : public OCLGLCommon { + public: + static const unsigned int c_imageWidth = 512; + static const unsigned int c_imageHeight = 512; + static const unsigned int c_elementsPerPixel = 4; + + OCLGLTexture(); + virtual ~OCLGLTexture(); + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual void run(void); + virtual unsigned int close(void); + + private: + unsigned int currentTest_; + void* inDataGL_; + void* outDataGL_; + GLuint inGLTexture_; + GLuint outGLTexture_; + bool testRender_; + template + bool runTextureTest(GLint internalFormat, GLenum format, GLenum type); +}; + +template +bool OCLGLTexture::runTextureTest(GLint internalFormat, GLenum format, + GLenum type) { + cl_mem image; + inDataGL_ = + malloc(c_imageWidth * c_imageHeight * c_elementsPerPixel * sizeof(T)); + outDataGL_ = + malloc(c_imageWidth * c_imageHeight * c_elementsPerPixel * sizeof(T)); + + // Initialize input data with random values + T* inputIterator = (T*)inDataGL_; + for (unsigned int i = 0; + i < c_imageWidth * c_imageHeight * c_elementsPerPixel; i++) { + inputIterator[i] = (T)(rand() % 255); + } + // Initialize output data with zeros + memset(outDataGL_, 0, + c_imageWidth * c_imageHeight * c_elementsPerPixel * sizeof(T)); + + // Generate and Bind in & out OpenGL textures + glGenTextures(1, &inGLTexture_); + glGenTextures(1, &outGLTexture_); + + glBindTexture(GL_TEXTURE_2D, inGLTexture_); + glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth, + (GLsizei)c_imageHeight, 0, format, type, inDataGL_); + + glBindTexture(GL_TEXTURE_2D, outGLTexture_); + glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); + 
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth, + (GLsizei)c_imageHeight, 0, format, type, outDataGL_); + + glFinish(); + + // Create input buffer from GL input texture + image = _wrapper->clCreateFromGLTexture( + context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, inGLTexture_, &error_); + if (error_ != CL_SUCCESS) { + printf("Unable to create input buffer from GL texture (%d)", error_); + return false; + } + buffers_.push_back(image); + + // Create output buffer from GL output texture + image = _wrapper->clCreateFromGLTexture( + context_, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, outGLTexture_, &error_); + if (error_ != CL_SUCCESS) { + printf("Unable to create output buffer from GL texture (%d)", error_); + return false; + } + buffers_.push_back(image); + size_t gws[2] = {c_imageWidth, c_imageHeight}; + + // Assign args + for (unsigned int i = 0; i < buffers_.size(); i++) { + error_ = + _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]); + if (error_ != CL_SUCCESS) { + printf("clSetKernelArg() failed (%d)", error_); + return false; + } + } + + int loop = (testRender_) ? 
2 : 1; + for (int l = 0; l < loop; ++l) { + if (testRender_ && (l == 0)) { + GLuint FrameBufferName = 0; + glGenFramebuffers(1, &FrameBufferName); + glBindFramebuffer(GL_FRAMEBUFFER, FrameBufferName); + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, inGLTexture_, + 0); + glClearColor(.5f, 1.f, 1.0f, 0); + glClear(GL_COLOR_BUFFER_BIT); + glFinish(); + } + + error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2, + &buffers()[0], 0, NULL, NULL); + if (error_ != CL_SUCCESS) { + printf("Unable to acquire GL objects (%d)", error_); + return false; + } + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, + NULL, gws, NULL, 0, NULL, NULL); + if (error_ != CL_SUCCESS) { + printf("clEnqueueNDRangeKernel() failed (%d)", error_); + return false; + } + + error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2, + &buffers()[0], 0, NULL, NULL); + if (error_ != CL_SUCCESS) { + printf("clEnqueueReleaseGLObjects failed (%d)", error_); + return false; + } + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + if (error_ != CL_SUCCESS) { + printf("clFinish() failed (%d)", error_); + return false; + } + + if (testRender_ && (l == 0)) { + glClearColor(1.f, 1.f, 1.f, 1.f); + glClear(GL_COLOR_BUFFER_BIT); + glFinish(); + } + } + + // Get the results from GL texture + glBindTexture(GL_TEXTURE_2D, outGLTexture_); + glActiveTexture(GL_TEXTURE0); + glGetTexImage(GL_TEXTURE_2D, 0, format, type, outDataGL_); + + // Check output texture data + inputIterator = (T*)inDataGL_; + T* outputIterator = (T*)outDataGL_; + T color; + switch (type) { + case GL_UNSIGNED_INT: + color = (T)0x3f800000; + break; + case GL_UNSIGNED_BYTE: + color = (T)0xff; + break; + case GL_SHORT: + color = (T)0x7fff; + break; + case GL_FLOAT: + color = (T)1.f; + break; + default: + return false; + } + for (unsigned int i = 0; + i < c_imageWidth * c_imageHeight * c_elementsPerPixel; i++) { + if (testRender_) { + if (outputIterator[i] != color) { + 
std::cout << "Element " << i + << " in output texture is incorrect! (internal format = " + << internalFormat << "\n\t expected:" << inputIterator[i] + << " differs from actual clear color:" << color << std::endl; + return false; + } + } else if (inputIterator[i] != outputIterator[i]) { + std::cout << "Element " << i + << " in output texture is incorrect! (internal format = " + << internalFormat << "\n\t expected:" << inputIterator[i] + << " differs from actual: " << outputIterator[i] << std::endl; + return false; + } + } + return true; +} + +#endif // _OCL_GL_TEXTURE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp new file mode 100644 index 0000000000..7a58fc06c6 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/TestList.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLTestListImp.h" + +// +// Includes for tests +// +#include "OCLGLBuffer.h" +#include "OCLGLBufferMultipleQueues.h" +#include "OCLGLDepthBuffer.h" +#include "OCLGLDepthTex.h" +#include "OCLGLFenceSync.h" +#include "OCLGLMsaaTexture.h" +#include "OCLGLMultiContext.h" +#include "OCLGLTexture.h" + +// +// Helper macro for adding tests +// +template +static void* dictionary_CreateTestFunc(void) { + return new T(); +} + +#define TEST(name) \ + { #name, &dictionary_CreateTestFunc < name> } + +TestEntry TestList[] = { + TEST(OCLGLBuffer), TEST(OCLGLBufferMultipleQueues), + TEST(OCLGLTexture), TEST(OCLGLMultiContext), + TEST(OCLGLFenceSync), TEST(OCLGLDepthTex), +}; + +unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]); +unsigned int TestLibVersion = 0; +const char* TestLibName = "oclgl"; diff --git a/projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude b/projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude new file mode 100644 index 0000000000..39345e8fd7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/gl/oclgl.exclude @@ -0,0 +1 @@ +# all clear diff --git a/projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h b/projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h new file mode 100644 index 0000000000..92e730d534 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/include/BaseTestImp.h @@ -0,0 +1,206 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _BaseTestImp_H_ +#define _BaseTestImp_H_ + +#include +#include +#include +#include +#include + +#include "OCLTest.h" +#include "OCLWrapper.h" + +#define EXIT_SILENT_FAILURE 2 +#define KERNEL(...) #__VA_ARGS__ + +#ifdef _MSC_VER +#define snprintf sprintf_s +#endif + +#define CHECK_ERROR(error, msg) \ + if (error != CL_SUCCESS) { \ + _errorFlag = true; \ + printf("\n\n%s\nError code: %d\n\n", msg, error); \ + _errorMsg = msg; \ + _crcword += 1; \ + return; \ + } + +#define CHECK_ERROR_NO_RETURN(error, msg) \ + if (error != CL_SUCCESS) { \ + _errorFlag = true; \ + printf("\n\n%s\nError code: %d\n\n", msg, error); \ + _errorMsg = msg; \ + _crcword += 1; \ + } + +#define CHECK_RESULT(test, msg, ...) 
\ + if ((test)) { \ + char* buf = (char*)malloc(4096); \ + _errorFlag = true; \ + int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \ + assert(rc >= 0 && rc < (int)4096); \ + printf("%s:%d - %s\n", __FILE__, __LINE__, buf); \ + _errorMsg = std::string(buf); \ + _crcword += 1; \ + free(buf); \ + return; \ + } + +#define CHECK_RESULT_ARGS CHECK_RESULT + +#define CHECK_RESULT_NO_RETURN(test, msg, ...) \ + if ((test)) { \ + char* buf = (char*)malloc(4096); \ + _errorFlag = true; \ + int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \ + assert(rc >= 0 && rc < (int)4096); \ + printf("%s:%d - %s\n", __FILE__, __LINE__, buf); \ + _errorMsg = std::string(msg); \ + _crcword += 1; \ + free(buf); \ + } + +#define CHECK_RESULT_NO_RETURN_ARGS CHECK_RESULT_NO_RETURN + +#define CHECK_RESULT_SHUTDOWN(test, msg) \ + if ((test)) { \ + _errorFlag = true; \ + printf("%s\n", msg); \ + _errorMsg = msg; \ + _crcword += 1; \ + close(); \ + return; \ + } + +#define CHECK_RESULT_CL(test, msg) \ + if ((test)) { \ + _errorFlag = true; \ + printf("%s\n", msg); \ + _errorMsg = msg; \ + _crcword += 1; \ + return 1; \ + } + +class BaseTestImp : public OCLTest { + public: + BaseTestImp(); + virtual ~BaseTestImp(); + + public: + virtual unsigned int getThreadUsage(void); + virtual int getNumSubTests(void); + + //! Abstract functions being defined here + virtual void open(); + virtual void open(unsigned int test, const char* deviceName, + unsigned int architecture); + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId, unsigned int platformIndex) { + return open(test, "Tahiti", platformIndex); + } + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + return open(test, "Tahiti", 0); + } + + virtual void run(void) = 0; + virtual unsigned int close(void); + + //! 
Functions to set class members + virtual void checkComplib(unsigned int test, const char* deviceName, + unsigned int architecture); + virtual void setDeviceName(const char*); + virtual const char* getDeviceName(); + virtual void setErrorMsg(const char* error); + virtual const char* getErrorMsg(void); + virtual bool hasErrorOccured(void); + virtual void clearError(); + BaseTestImp* toBaseTestImp() { return this; } + virtual OCLTestImp* toOCLTestImp() { return NULL; } + virtual void useCPU() { _cpu = true; } + virtual void setIterationCount(int cnt); + virtual void setDeviceId(unsigned int deviceId); + virtual unsigned int getDeviceId(); + virtual void setPlatformIndex(unsigned int platformIndex); + virtual unsigned int getPlatformIndex(); + virtual float getPerfInfo(); + virtual void clearPerfInfo(); + + protected: + unsigned int _numSubTests; + unsigned int _openTest; + unsigned int _useThreads; + int _iterationCnt; + float _perfInfo; + bool _cpu; + + unsigned int _crcword; + unsigned int _crctab[256]; + + bool _errorFlag; + std::string _errorMsg; + + const char* _deviceName; + unsigned int _architecture; + unsigned int _deviceId; + unsigned int _platformIndex; + bool failed_ = false; + cl_int error_; + cl_uint type_; + cl_uint deviceCount_; + cl_device_id* devices_; + cl_context context_; + + cl_program program_; + cl_kernel kernel_; +}; + +// enum to keep track of different memory types +enum MemType { LOOCL, REMOTE_CACHED, REMOTE_UNCACHED }; + +class DataType { + cl_image_format f; + const char* str; + unsigned int size; + + public: + DataType() {} + + DataType(cl_image_format f, const char* str, unsigned int size) { + this->f = f; + this->str = str; + this->size = size; + } + operator const char*() { return str; } + + operator unsigned int() { return size; } + operator cl_image_format() { return f; } +}; + +// useful for initialization of an array of data types for a test +#define DTYPE(x, y) DataType(x, #x, (unsigned int)y) + +#endif diff --git 
a/projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h b/projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h new file mode 100644 index 0000000000..fe32e08efa --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/include/OCLTestImp.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCLTestImp_H_ +#define _OCLTestImp_H_ + +#include +#include + +#include "BaseTestImp.h" +#include "CL/cl.h" +#include "OCL/Thread.h" +#include "OCLTest.h" +#include "OCLWrapper.h" + +class OCLTestImp : public BaseTestImp { + public: + OCLTestImp(); + virtual ~OCLTestImp(); + + public: + //! 
Abstract functions being defined here + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId, unsigned int platformIndex); + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual void run(void) = 0; + virtual unsigned int close(void); + //! Functions to set class members + + public: + void useCPU(); + int genIntRand(int a, int b); + int genBitRand(int n); + void accumulateCRC(const void* buffer, int len); + void setOCLWrapper(OCLWrapper* wrapper); + OCLTestImp* toOCLTestImp() { return this; } + + static OCLutil::Lock openDeviceLock; + static OCLutil::Lock compileLock; + + protected: + const std::vector& buffers() const { return buffers_; } + + OCLWrapper* _wrapper; + + int _seed; + + // Common data of any CL program + cl_int error_; + cl_uint type_; + cl_uint deviceCount_; + cl_device_id* devices_; + cl_platform_id platform_; + std::vector cmdQueues_; + cl_context context_; + + cl_program program_; + cl_kernel kernel_; + std::vector buffers_; +}; + +// useful for initialization of an array of data types for a test +#define DTYPE(x, y) DataType(x, #x, (unsigned int)y) + +#endif diff --git a/projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h b/projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h new file mode 100644 index 0000000000..5dfa6ffd13 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/include/OCLTestListImp.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef __Dictionary_h__ +#define __Dictionary_h__ + +// +// Testing module (plugin) interface forward declarations +// +#ifdef ATI_OS_WIN +#define OCL_DLLEXPORT __declspec(dllexport) +#define OCL_CALLCONV __cdecl +#endif +#ifdef ATI_OS_LINUX +#define OCL_DLLEXPORT +#define OCL_CALLCONV +#endif + +class OCLTest; + +// +// OCLTestList_TestCount - retrieve the number of tests in the testing module +// +extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV OCLTestList_TestCount(void); + +// +// OCLTestList_TestLibVersion - retrieve the version of test lib in the testing +// module +// +extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV +OCLTestList_TestLibVersion(void); + +// +// OCLTestList_TestLibName - retrieve the name of test library +// +extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV OCLTestList_TestLibName(void); + +// +// OCLTestList_TestName - retrieve the name of the indexed test in the module +// +extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV +OCLTestList_TestName(unsigned int testNum); + +// +// OCLTestList_CreateTest - create a test by index +// +extern "C" OCL_DLLEXPORT OCLTest* OCL_CALLCONV +OCLTestList_CreateTest(unsigned int testNum); + +// +// OCLTestList_DestroyTest - destroy a test object +// +extern "C" OCL_DLLEXPORT void OCL_CALLCONV +OCLTestList_DestroyTest(OCLTest* test); + +// +// internal global data that is populated in each dll +// +typedef struct _TestEntry { + const char* name; + void* (*create)(void); +} TestEntry; + +extern TestEntry TestList[]; +extern unsigned int TestListCount; +extern unsigned int TestLibVersion; +extern const char* TestLibName; + +#endif diff --git a/projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h b/projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h new file mode 100644 index 0000000000..50adba1c8c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/include/OclIncludes.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+// Umbrella include: always pulls in CL/cl.h and, when ATI_OS_WIN is defined,
+// additionally d3d9.h with POINTER_64 defined beforehand.
+#ifndef _OCL_INCLUDES_H
+#define _OCL_INCLUDES_H
+
+#ifdef ATI_OS_WIN
+#define POINTER_64 __ptr64
+// NOTE(review): the header name on the next line is missing in this copy;
+// presumably <windows.h>, which d3d9.h normally needs -- confirm against the
+// repository.
+#include
+#include "d3d9.h"
+#endif
+
+#include "CL/cl.h"
+
+#endif  //_OCL_INCLUDES_H
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp
new file mode 100644
index 0000000000..4121c15911
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.cpp
@@ -0,0 +1,211 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerf3DImageWriteSpeed.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define KERNEL_CODE(...) 
#__VA_ARGS__ + +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {64, 128, 256, 512}; + +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"}; +static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8)}; + +const static char *strKernel = {KERNEL_CODE( + \n __kernel void image_kernel(write_only image3d_t input) { + size_t x = get_global_id(0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + int4 coords = (int4)(x, y, z, 0); + write_imageui(input, coords, (1, 1, 1, 1)); +} + \n)}; + +OCLPerf3DImageWriteSpeed::OCLPerf3DImageWriteSpeed() { + _numSubTests = NUM_SIZES * NUM_FORMATS; +} + +OCLPerf3DImageWriteSpeed::~OCLPerf3DImageWriteSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerf3DImageWriteSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + testId_ = test; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + imageBuffer_ = 0; + skip_ = false; + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS, + 1024, charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + if (!strstr(charbuf, "cl_khr_3d_image_writes")) { + skip_ = true; + testDescString = "3D Write not supported. 
Test Skipped."; + return; + } + + bufSize_ = Sizes[test % NUM_SIZES]; + bufnum_ = (test / NUM_SIZES) % NUM_FORMATS; + memSize_ = bufSize_ * bufSize_ * bufSize_ * formatSize[bufnum_]; + + cmd_queue_ = cmdQueues_[_deviceId]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + imageBuffer_ = _wrapper->clCreateImage3D( + context_, CL_MEM_WRITE_ONLY, &formats[bufnum_], bufSize_, bufSize_, + bufSize_, 0, 0, NULL, &error_); + CHECK_RESULT(imageBuffer_ == 0, "clCreateImage(imageBuffer_) failed"); + + // set kernel arguments + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); +} + +void OCLPerf3DImageWriteSpeed::run(void) { + if (skip_) { + return; + } + + CPerfCounter timer; + unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS; + + size_t gws[3] = {bufSize_, bufSize_, bufSize_}; + size_t lws[3] = {8, 8, 4}; + + // warm up + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws, + lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmd_queue_); + + // checkData + char *bufptr = (char *)malloc(memSize_); + + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, bufSize_}; + size_t image_row_pitch = bufSize_ * 
formatSize[bufnum_]; + size_t image_slice_pitch = image_row_pitch * bufSize_; + error_ = clEnqueueReadImage(cmd_queue_, imageBuffer_, true, origin, region, + image_row_pitch, image_slice_pitch, bufptr, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed"); + + for (size_t i = 0; i < bufSize_ * bufSize_ * bufSize_ * 4; ++i) { + if (bufptr[i] != 1) { + printf("(%4dx%4dx%4d) fmt:%s(%1u) checkData() fail, image_ptr[%u] = %d\n", + bufSize_, bufSize_, bufSize_, textFormats[fmt_num], + formatSize[bufnum_], (unsigned int)i, (int)bufptr[i]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + char buf[256]; + SNPRINTF(buf, sizeof(buf), + " (%4dx%4dx%4d) fmt:%s(%1d) checkData() FAILED! ", bufSize_, + bufSize_, bufSize_, textFormats[fmt_num], formatSize[bufnum_]); + testDescString = buf; + return; + } + } + delete bufptr; + + // test begins + unsigned int numIter = 5; + + timer.Reset(); + timer.Start(); + + for (unsigned int i = 0; i < numIter; ++i) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws, + lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmd_queue_); + } + + timer.Stop(); + + double sec = timer.GetElapsedTime(); + + // write_image speed in GB/s + double perf = ((double)memSize_ * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%3dx%3dx%3d) fmt:%s(%1u) i: %2d (GB/s) ", + bufSize_, bufSize_, bufSize_, textFormats[fmt_num], + formatSize[bufnum_], numIter); + testDescString = buf; +} + +unsigned int OCLPerf3DImageWriteSpeed::close(void) { + if (!skip_) { + if (imageBuffer_) { + error_ = _wrapper->clReleaseMemObject(imageBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(imageBuffer_) failed"); + } + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h new file mode 100644 index 0000000000..eb6e9ce12c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerf3DImageWriteSpeed.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_3DImageWriteSpeed_H_
+#define _OCL_3DImageWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times a kernel writing to a 3D image.
+// open() prepares one sub-test, run() validates and times it, close() releases
+// the image.
+class OCLPerf3DImageWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerf3DImageWriteSpeed();
+  virtual ~OCLPerf3DImageWriteSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Queue used for all enqueues; open() takes it from cmdQueues_[_deviceId].
+  cl_command_queue cmd_queue_;
+  // 3D image the kernel writes to; created in open(), released in close().
+  cl_mem imageBuffer_;
+
+  // Edge length of the cubic image for the current sub-test.
+  unsigned int bufSize_;
+  // Index of the image format selected for the current sub-test.
+  unsigned int bufnum_;
+  // NOTE(review): not referenced by OCLPerf3DImageWriteSpeed.cpp - possibly
+  // unused; confirm before removing.
+  char* memptr;
+  // Size of the whole image in bytes (edge^3 * bytes per texel).
+  unsigned int memSize_;
+  // Sub-test index passed to open().
+  unsigned int testId_;
+
+  // Set by open() when the device lacks cl_khr_3d_image_writes; run() and
+  // close() then skip their image work.
+  bool skip_;
+};
+
+#endif // _OCL_3DImageWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp
new file mode 100644
index 0000000000..599d2cec33
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.cpp
@@ -0,0 +1,451 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfAES256.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const char *aes256_kernel = + "// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT " + "REDISTRIBUTE!!\n" + "inline uint Load(__global uint* pData, const uint iX, const uint iY)\n" + "{\n" + " return pData[iX | (iY << 8)];\n" + "}\n" + "\n" + "\n" + "inline uint4 Load4(__global uint* pData, const uint4 uX, const uint iY)\n" + "{\n" + " uint uExtent = iY << 8;\n" + " uint4 uNdx = uX + uExtent;\n" + " \n" + " return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], " + "pData[uNdx.w]);\n" + "}\n" + "\n" + "\n" + "__kernel \n" + "__attribute__((vec_type_hint(uint4))) \n" + "void CryptThread(__global uint4* pInput, __global uint4* pOutput,\n" + " __global uint* pTables,\n" + " __global uint4* pKey, const uint iRounds)\n" + "{\n" + " const uint iNdx = get_global_id(0);\n" + " \n" + " uint4 state, istate, tstate;\n" + " state = pInput[iNdx] ^ pKey[iRounds];\n" + " \n" + " for (uint i = iRounds-1; i; i--)\n" + " {\n" + " istate = state & 0xFF;\n" + " tstate = Load4(pTables, istate.xyzw, 0);\n" + "\n" + " istate = (state >> 8) & 0xFF;\n" + " tstate^= Load4(pTables, istate.wxyz, 1);\n" + "\n" + " istate = (state >> 16) & 0xFF;\n" + " tstate^= Load4(pTables, istate.zwxy, 2);\n" + "\n" + " istate = state >> 24;\n" + " tstate^= Load4(pTables, istate.yzwx, 3);\n" + "\n" + " state = tstate ^ pKey[i];\n" + " }\n" + "\n" + " istate = state & 0xFF;\n" + " tstate = Load4(pTables, istate.xyzw, 4);\n" + "\n" + " istate = (state >> 8) & 0xFF;\n" + " tstate |= Load4(pTables, istate.wxyz, 4) << 8;\n" + "\n" + " istate = (state >> 16) & 0xFF;\n" + " tstate |= Load4(pTables, istate.zwxy, 4) << 16;\n" + 
"\n" + " istate = state >> 24;\n" + " tstate |= Load4(pTables, istate.yzwx, 4) << 24;\n" + "\n" + " pOutput[iNdx] = tstate ^ pKey[0];\n" + "}\n"; + +static const char *aes256_kernel2 = + "// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT " + "REDISTRIBUTE!!\n" + "#define AES_BLOCK_SIZE 16\n" + "#define AES_TABLE_SIZE 256\n" + "\n" + "#define AES_TABLE_MAX 5\n" + "#define AES_CONST_SIZE (AES_TABLE_SIZE*AES_TABLE_MAX)\n" + "\n" + "#define AES_ROUND_128 10\n" + "#define AES_ROUND_192 12\n" + "#define AES_ROUND_256 14\n" + "#define AES_ROUNDKEY_MAX (AES_BLOCK_SIZE/4*(AES_ROUND_256+1))\n" + "#define _IS_GPU_\n" + "\n" + "\n" + "inline uint Load(\n" + "#ifdef _IS_GPU_\n" + " __local uint* pData,\n" + "#else\n" + " __constant uint* pData,\n" + "#endif\n" + " const uint iX, const uint iY)\n" + "{\n" + " const uint uNdx = iX + iY*AES_TABLE_SIZE;\n" + " return pData[uNdx];\n" + "}\n" + "\n" + "\n" + "inline uint4 Load4(\n" + "#ifdef _IS_GPU_\n" + " __local uint* pData,\n" + "#else\n" + " __constant uint* pData,\n" + "#endif\n" + " const uint4 uX, const uint iY)\n" + "{\n" + " const uint uExtent = iY*AES_TABLE_SIZE;\n" + " const uint4 uNdx = uX + uExtent;\n" + " \n" + " return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], " + "pData[uNdx.w]);\n" + "}\n" + "\n" + "\n" + "__kernel \n" + "__attribute__((vec_type_hint(uint4)))\n" + "#ifdef KERNEL_MAX_THREADS\n" + "__attribute__((work_group_size_hint(KERNEL_MAX_THREADS, 1, 1)))\n" + "#endif\n" + "void CryptThread(__global const uint4* pInput, __global uint4* pOutput,\n" + " __constant uint* pTables,\n" + " __constant uint4* pKey, const uint iRounds)\n" + "{\n" + " const size_t iNdx = get_global_id(0);\n" + "\n" + "#ifdef _IS_GPU_\n" + " #define Load4T(x, y) Load4(ulTables, x, y)\n" + "\n" + " __local uint ulTables[AES_CONST_SIZE];\n" + "\n" + " const uint iLdx = get_local_id(0);\n" + " if (iLdx < AES_TABLE_SIZE) {\n" + " const uint iGrps = get_local_size(0);\n" + " const uint iLSize = min(iGrps, 
(uint)AES_TABLE_SIZE);\n" + " const uint iBpL = AES_CONST_SIZE/iLSize;\n" + "\n" + " const uint iStart = iLdx*iBpL;\n" + " const uint iEnd = iStart + iBpL;\n" + "\n" + " for (uint i=iStart; i> 8) & 0xFF;\n" + " tstate^= Load4T(istate.yzwx, 1);\n" + "\n" + " istate = (state >> 16) & 0xFF;\n" + " tstate^= Load4T(istate.zwxy, 2);\n" + "\n" + " istate = state >> 24;\n" + " tstate^= Load4T(istate.wxyz, 3);\n" + "\n" + " state = tstate ^ pKey[i];\n" + " }\n" + "\n" + " istate = state & 0xFF;\n" + " tstate = Load4T(istate.xyzw, 4);\n" + "\n" + " istate = (state >> 8) & 0xFF;\n" + " tstate |= Load4T(istate.yzwx, 4) << 8;\n" + "\n" + " istate = (state >> 16) & 0xFF;\n" + " tstate |= Load4T(istate.zwxy, 4) << 16;\n" + "\n" + " istate = state >> 24;\n" + " tstate |= Load4T(istate.wxyz, 4) << 24;\n" + "\n" + " pOutput[iNdx] = tstate ^ pKey[iRounds];\n" + "}\n"; + +OCLPerfAES256::OCLPerfAES256() { _numSubTests = 2; } + +OCLPerfAES256::~OCLPerfAES256() {} + +void OCLPerfAES256::setData(cl_mem buffer, unsigned int val) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) + data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); + _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfAES256::checkData(cl_mem buffer) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) { + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); + _wrapper->clFinish(cmd_queue_); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfAES256::open(unsigned int test, char *units, double &conversion, + 
unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + tableBuffer_ = 0; + keyBuffer_ = 0; + blockSize_ = 1024; + maxIterations = 50; + + bufSize_ = 5592320 * sizeof(cl_uint4); + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + // Increase iterations for devices with many CUs + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(size_t), &numCUs, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + maxIterations *= (unsigned int)(1 + 10 * numCUs / 20); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_, + NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_, + NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + tableBuffer_ = + _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 5120, NULL, &error_); + CHECK_RESULT(tableBuffer_ == 0, "clCreateBuffer(tableBuffer) failed"); + + keyBuffer_ = + _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 240, NULL, &error_); + CHECK_RESULT(keyBuffer_ == 0, "clCreateBuffer(keyBuffer) failed"); + + if (_openTest == 0) { + program_ = 
_wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&aes256_kernel, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + testDescString += "orig"; + } else { + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&aes256_kernel2, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + testDescString += " new"; + } + + const char *buildOps = NULL; + error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + cl_uint rounds = 14; + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), + (void *)&tableBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&keyBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&rounds); + setData(inBuffer_, 0xdeadbeef); + setData(outBuffer_, 0xdeadbeef); +} + +void OCLPerfAES256::run(void) { + int global = bufSize_ / sizeof(cl_uint4); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < maxIterations; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + } + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel 
failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // No idea what data should be in here + // checkData(outBuffer_); + // Compute GB/s + double perf = + ((double)bufSize_ * (double)maxIterations * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; +} + +unsigned int OCLPerfAES256::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (tableBuffer_) { + error_ = _wrapper->clReleaseMemObject(tableBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(tableBuffer_) failed"); + } + if (keyBuffer_) { + error_ = _wrapper->clReleaseMemObject(keyBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(keyBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h new file mode 100644 index 0000000000..2d7dc0b22d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAES256.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2010-present 
Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_AES256_H_
+#define _OCL_AES256_H_
+
+#include "OCLTestImp.h"
+
+// Throughput test that repeatedly runs the "CryptThread" kernel over a large
+// buffer. Unlike tests that rely on OCLTestImp::open(), this class creates
+// and releases its own context and command queue.
+class OCLPerfAES256 : public OCLTestImp {
+ public:
+  OCLPerfAES256();
+  virtual ~OCLPerfAES256();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // NOTE(review): not referenced by OCLPerfAES256.cpp - possibly unused.
+  std::string shader_;
+  // Fills `buffer` with the 32-bit pattern `data` via map/unmap.
+  void setData(cl_mem buffer, unsigned int data);
+  // Maps and unmaps `buffer` for reading; performs no comparison.
+  void checkData(cl_mem buffer);
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem inBuffer_;
+  cl_mem outBuffer_;
+  cl_mem tableBuffer_;
+  cl_mem keyBuffer_;
+  // Status of the most recent OpenCL call.
+  cl_int error_;
+
+  // NOTE(review): not referenced by OCLPerfAES256.cpp - possibly unused.
+  unsigned int width_;
+  // Size in bytes of inBuffer_ and outBuffer_.
+  unsigned int bufSize_;
+  // Set to 1024 in open(); not read elsewhere in OCLPerfAES256.cpp.
+  unsigned int blockSize_;
+  // Number of kernel launches timed by run(); scaled by numCUs in open().
+  unsigned int maxIterations;
+  // Receives the CL_DEVICE_MAX_COMPUTE_UNITS query result in open().
+  size_t numCUs;
+};
+
+#endif // _OCL_AES256_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp
new file mode 100644
index 0000000000..af42569224
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.cpp
@@ -0,0 +1,817 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfAtomicSpeed.h" + +#include +#include +#include +#include +#include + +#include "CL/cl.h" +#include "OCLPerfAtomicSpeedKernels.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +// Define the test suite tests. +testOCLPerfAtomicSpeedStruct testOCLPerfAtomicSpeedList[] = { + {LocalHistogram, 1}, + {LocalHistogram, 2}, + {LocalHistogram, 4}, + {GlobalHistogram, 1}, + {GlobalHistogram, 2}, + {GlobalHistogram, 4}, + {Global4Histogram, 1}, + {Global4Histogram, 2}, + {Global4Histogram, 4}, + {LocalReductionNoAtomics, 1}, + {LocalReductionNoAtomics, 2}, + {LocalReductionNoAtomics, 4}, + {LocalReductionAtomics, 1}, + {LocalReductionAtomics, 2}, + {LocalReductionAtomics, 4}, + {Local4ReductionNoAtomics, 1}, + {Local4ReductionNoAtomics, 2}, + {Local4ReductionNoAtomics, 4}, + /* {Local4ReductionAtomics, 1}, + {Local4ReductionAtomics, 2}, + {Local4ReductionAtomics, 4},*/ + {GlobalWGReduction, 1}, + {GlobalWGReduction, 2}, + {GlobalWGReduction, 4}, + {GlobalAllToZeroReduction, 1}, + {GlobalAllToZeroReduction, 2}, + {GlobalAllToZeroReduction, 4}, + {Global4WGReduction, 1}, + {Global4WGReduction, 2}, + {Global4WGReduction, 4}, + {Global4AllToZeroReduction, 1}, + {Global4AllToZeroReduction, 2}, + {Global4AllToZeroReduction, 4}, +}; + +/////////////////////////////////////////////////////////////////////////////// +// OCLPerfAtomicSpeed implementation. 
+///////////////////////////////////////////////////////////////////////////////
+// One sub-test per entry of testOCLPerfAtomicSpeedList; all handles start
+// out empty.
+OCLPerfAtomicSpeed::OCLPerfAtomicSpeed() {
+  _atomicsSupported = false;
+  _dataSizeTooBig = false;
+  _numSubTests =
+      sizeof(testOCLPerfAtomicSpeedList) / sizeof(testOCLPerfAtomicSpeedStruct);
+  _numLoops = 10;
+  _nCurrentInputScale = 1;
+  _maxMemoryAllocationSize = 0;
+
+  _input = NULL;
+  _output = NULL;
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+  _workgroupSize = 256;
+  _programs.clear();
+  _kernels.clear();
+}
+
+OCLPerfAtomicSpeed::~OCLPerfAtomicSpeed() {}
+
+// Prepares sub-test `test` on device `deviceId` of platform _platformIndex:
+// generates the host data, creates the context, queue, input/output buffers
+// and the kernels for the selected atomic type.
+// Returns early, without creating a queue, when the input does not fit the
+// device (_dataSizeTooBig) or when atomics are unsupported
+// (!_atomicsSupported).
+// NOTE(review): CHECK_RESULT is assumed to record the failure and return from
+// the enclosing function (a separate _NO_RETURN variant exists) - confirm.
+void OCLPerfAtomicSpeed::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_int status = CL_SUCCESS;
+
+  device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  _cpuReductionSum = 0;
+  _nCurrentInputScale = testOCLPerfAtomicSpeedList[_openTest].inputScale;
+  AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
+
+  // Setup stuff...
+  setupHistogram();
+  calculateHostBin();
+
+  context_ = 0;
+  cmd_queue_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    const cl_int idsStatus =
+        _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    error_ = idsStatus;
+    // Only index the array when the requested platform actually exists.
+    if (idsStatus == CL_SUCCESS && _platformIndex < numPlatforms) {
+      /* Get the number of requested devices */
+      error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0,
+                                        NULL, &num_devices);
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so error_ is deliberately not checked here.
+      // Choose platform with GPU devices
+      if (num_devices > 0) {
+        platform = platforms[_platformIndex];
+      }
+    }
+    // Array form: the storage came from new[]. Released before the check
+    // below so that an early return cannot leak it.
+    delete[] platforms;
+    CHECK_RESULT(idsStatus != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free the list on every path.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, NULL, NULL, &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // Largest single allocation the device supports.
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                     sizeof(cl_ulong),
+                                     &_maxMemoryAllocationSize, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS,
+               "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");
+
+  // Check that the test size is not too big for the current GPU.
+  // The first clause keeps the unsigned subtraction from wrapping around on
+  // devices whose limit is not above the 10 MB margin.
+  _dataSizeTooBig = false;
+  cl_ulong tenMB = 1024 * 10240;
+  if (_maxMemoryAllocationSize <= tenMB ||
+      _inputNBytes >= (_maxMemoryAllocationSize - tenMB)) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  char *p = strstr(charbuf, "cl_khr_global_int32_base_atomics");
+  char *p2 = strstr(charbuf, "cl_khr_local_int32_base_atomics");
+
+  _atomicsSupported = false;
+  if (p || p2) _atomicsSupported = true;
+
+  // Verify atomics are supported.
+  if (!_atomicsSupported) return;
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  // Create buffers...
+  _inputBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY,
+                                          _inputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)");
+
+  // Create the programs/kernels for the current test type.
+  CreateKernels(atomicType);
+
+  _nThreadsPerGroup = _workgroupSize;
+  _nGroups = _nThreads / _nThreadsPerGroup;
+  _outputNBytes = _nGroups * NBINS * sizeof(cl_uint);
+  if (IsReduction(atomicType)) _outputNBytes = _inputNBytes;
+
+  _output = (cl_uint *)malloc(_outputNBytes);
+  if (0 == _output) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // Create output Buffer
+  _outputBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           _outputNBytes, 0, &status);
+  CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)");
+}
+
+// Create the programs/kernels for the current test type.
+// Appends the created objects to _programs and _kernels; LocalHistogram needs
+// two programs/kernels, every other type one.
+void OCLPerfAtomicSpeed::CreateKernels(const AtomicType atomicType) {
+  char log[16384];
+  cl_kernel kernel_;
+  cl_program program_;
+  char buildOptions[1000];
+
+  SNPRINTF(buildOptions, sizeof(buildOptions),
+           "-D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS, BITS_PER_PIX,
+           NBANKS);
+
+  // Create the programs.
+  switch (atomicType) {
+    case LocalHistogram:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_histogram, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_reduce, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case LocalReductionNoAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case Local4ReductionNoAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_vec4_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case LocalReductionAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_atomics_reduction, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case Local4ReductionAtomics:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&local_vec4_atomics_reduction, NULL,
+          &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_histogram, NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      program_ = _wrapper->clCreateProgramWithSource(
+          context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
+          NULL, &error_);
+      CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+      _programs.push_back(program_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
+  }
+  // Build the programs.
+  for (size_t i = 0; i < _programs.size(); i++) {
+    error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
+                                      NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      // Start from an empty string so that a failed log query does not print
+      // uninitialised memory.
+      log[0] = '\0';
+      _wrapper->clGetProgramBuildInfo(_programs[i], device,
+                                      CL_PROGRAM_BUILD_LOG, sizeof(log), log,
+                                      NULL);
+      printf("Build error -> %s\n", log);
+
+      // The condition must be true for the failure to be recorded.
+      CHECK_RESULT(true, "clBuildProgram failed");
+    }
+  }
+
+  switch (atomicType) {
+    case LocalHistogram:
+      kernel_ = _wrapper->clCreateKernel(_programs[0],
+                                         "local_atomics_histogram", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      kernel_ = _wrapper->clCreateKernel(_programs[1], "local_atomics_reduce",
+                                         &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case LocalReductionNoAtomics:
+    case Local4ReductionNoAtomics:
+    case LocalReductionAtomics:
+    case Local4ReductionAtomics:
+      kernel_ =
+          _wrapper->clCreateKernel(_programs[0], "local_reduction", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalHistogram:
+    case Global4Histogram:
+      kernel_ = _wrapper->clCreateKernel(_programs[0],
+                                         "global_atomics_histogram", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_workgroup", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      kernel_ = _wrapper->clCreateKernel(
+          _programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
+      CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+      _kernels.push_back(kernel_);
+      break;
+    default:
+      CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
+  }
+}
+
+// Sets the kernel arguments based on the current
test type. +void OCLPerfAtomicSpeed::SetKernelArguments(const AtomicType atomicType) { + int Arg = 0; + int localSize = 0; + int itemsPerThread = 1; + cl_int status = CL_SUCCESS; + + switch (atomicType) { + case LocalHistogram: + // Set arguments for the local atomics histogram kernel + status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_inputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)"); + + status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_outputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)"); + + status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, + sizeof(_n4VectorsPerThread), + (void *)&_n4VectorsPerThread); + CHECK_RESULT(status, "clSetKernelArg failed. (n4VectorsPerThread)"); + + // Set arguments for the local atomics reduce kernel + Arg = 0; + status |= _wrapper->clSetKernelArg(_kernels[1], Arg++, sizeof(cl_mem), + (void *)&_outputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)"); + + status |= _wrapper->clSetKernelArg(_kernels[1], Arg++, sizeof(_nGroups), + (void *)&_nGroups); + CHECK_RESULT(status, "clSetKernelArg failed. (nGroups)"); + break; + case LocalReductionAtomics: + case LocalReductionNoAtomics: + case Local4ReductionNoAtomics: + case Local4ReductionAtomics: + status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_inputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)"); + + status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_outputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)"); + + localSize = DEFAULT_WG_SIZE * sizeof(cl_uint); + if ((Local4ReductionNoAtomics == atomicType) || + (Local4ReductionAtomics == atomicType)) + localSize *= 4; + status = _wrapper->clSetKernelArg(_kernels[0], Arg++, localSize, NULL); + CHECK_RESULT(status, "clSetKernelArg failed. 
(local memory)"); + break; + case GlobalHistogram: + case Global4Histogram: + case GlobalWGReduction: + case Global4WGReduction: + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + // Set arguments for the global atomics histogram kernel + if ((Global4Histogram == atomicType) || + (Global4WGReduction == atomicType) || + (Global4AllToZeroReduction == atomicType)) + itemsPerThread = 4; + + status = _wrapper->clSetKernelArg( + _kernels[0], Arg++, sizeof(itemsPerThread), (void *)&itemsPerThread); + CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)"); + + status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_inputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)"); + + status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_outputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)"); + break; + default: + CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)"); + } +} + +// Since we write multiple times to the output in global atomics, need to +// reset the content every time. +void OCLPerfAtomicSpeed::ResetGlobalOutput() { + cl_int status; + + memset(_output, 0, _outputNBytes); + + status = + _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0, + _outputNBytes, _output, 0, NULL, NULL); + CHECK_RESULT(status, "clEnqueueWriteBuffer failed."); + + status = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); +} + +// Run the local histogram kernels. 
+//
+// Enqueues the histogram kernel (_kernels[0]) over _nThreads work-items and
+// then the reduce kernel (_kernels[1]) over NBINS work-items, the latter
+// waiting on the former through an event. Both events are released before
+// returning so that repeated calls from run() do not leak them.
+void OCLPerfAtomicSpeed::RunLocalHistogram() {
+  cl_int status;
+  cl_event events[2] = {NULL, NULL};
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+  size_t globalThreadsReduce = NBINS;
+  size_t localThreadsReduce = _nThreadsPerGroup;
+
+  globalThreads[0] = _nThreads;
+  localThreads[0] = _nThreadsPerGroup;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, &events[0]);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (histogram)");
+
+  status = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, _kernels[1], 1, NULL, &globalThreadsReduce,
+      &localThreadsReduce, 1, &events[0], &events[1]);
+  if (status != CL_SUCCESS) {
+    // Only the first event exists at this point.
+    _wrapper->clReleaseEvent(events[0]);
+    CHECK_RESULT(true, "clEnqueueNDRangeKernel failed. (reduce)");
+  }
+
+  // Collect both results first, release the events, and only then let
+  // CHECK_RESULT bail out, so that no path leaks an event.
+  cl_int finishStatus = _wrapper->clFinish(cmd_queue_);
+  cl_int waitStatus = _wrapper->clWaitForEvents(2, events);
+  _wrapper->clReleaseEvent(events[0]);
+  _wrapper->clReleaseEvent(events[1]);
+
+  CHECK_RESULT(finishStatus, "clFinish failed.");
+  CHECK_RESULT(waitStatus, "clWaitForEvents failed.");
+}
+
+// Run the local reduction kernel.
+//
+// Launches _kernels[0] with one work-item per two input cl_uint values
+// (per eight for the vec4 variants) and waits for completion.
+void OCLPerfAtomicSpeed::RunLocalReduction(const AtomicType atomicType) {
+  cl_int status;
+  size_t globalThreads[3] = {1};
+  size_t localThreads[3] = {1};
+
+  globalThreads[0] = _inputNBytes / sizeof(cl_uint) / 2;
+  localThreads[0] = _nThreadsPerGroup;
+  if ((Local4ReductionNoAtomics == atomicType) ||
+      (Local4ReductionAtomics == atomicType))
+    globalThreads[0] /= 4;
+
+  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
+                                            globalThreads, localThreads, 0,
+                                            NULL, NULL);
+  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduction)");
+
+  status = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(status, "clFinish failed.");
+}
+
+// Run the global histogram kernel.
+void OCLPerfAtomicSpeed::RunGlobalHistogram(AtomicType atomicType) { + cl_uint status; + size_t globalThreads[3] = {1}; + size_t localThreads[3] = {1}; + + globalThreads[0] = _inputNBytes / sizeof(cl_uint); + localThreads[0] = _nThreadsPerGroup; + + if ((Global4Histogram == atomicType) || (Global4WGReduction == atomicType) || + (Global4AllToZeroReduction == atomicType)) + globalThreads[0] /= 4; + + status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL, + globalThreads, localThreads, 0, + NULL, NULL); + CHECK_RESULT(status, "clEnqueueNDRangeKernel failed."); + + status = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); +} + +// Run the AtomicSpeed logic. +void OCLPerfAtomicSpeed::run() { + int Arg = 0; + cl_uint status; + AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType; + + // Verify atomics are supported. + if ((!_atomicsSupported) || (_dataSizeTooBig)) return; + + // Write data to the GPU + status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0, + _inputNBytes, _input, 0, NULL, NULL); + CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)"); + + status = _wrapper->clFlush(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); + + // Set the current arguments based on the test type. + SetKernelArguments(atomicType); + + // Run the kernels. + CPerfCounter timer; + double totalTime = 0.0f; + + for (unsigned int k = 0; k < _numLoops + 1; k++) { + // Since we run multiple times using global atomics the output + // would get accumulated therefore first clean it. 
+ ResetGlobalOutput(); + + timer.Reset(); + timer.Start(); + switch (atomicType) { + case LocalHistogram: + RunLocalHistogram(); + break; + case LocalReductionAtomics: + case LocalReductionNoAtomics: + case Local4ReductionNoAtomics: + case Local4ReductionAtomics: + RunLocalReduction(atomicType); + break; + case GlobalHistogram: + case Global4Histogram: + case GlobalWGReduction: + case Global4WGReduction: + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + RunGlobalHistogram(atomicType); + break; + default: + CHECK_RESULT(true, "Atomic type not supported"); + } + timer.Stop(); + // Don't count the warm-up + if (0 != k) totalTime += timer.GetElapsedTime(); + } + + // Read the results back to the CPU - Only do it for the last run + // of the test instead of for each iteration of _numLoops. + status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0, + _outputNBytes, _output, 0, NULL, NULL); + CHECK_RESULT(status, "clEnqueueReadBuffer failed."); + status = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); + + // Print the results. + PrintResults(atomicType, totalTime); + + // Check the results for the current test. 
+ _errorFlag = !(VerifyResults(atomicType)); +} + +// Compare the results and see if they match +bool OCLPerfAtomicSpeed::VerifyResults(const AtomicType atomicType) { + cl_uint i = 0; + bool flag = true; + cl_uint calculatedValue = 0; + cl_uint reductionElementCount = 0; + switch (atomicType) { + case LocalHistogram: + case GlobalHistogram: + case Global4Histogram: + for (i = 0; i < NBINS; ++i) { + if (_cpuhist[i] != _output[i]) { + flag = false; + break; + } + } + break; + case LocalReductionAtomics: + case LocalReductionNoAtomics: + case Local4ReductionNoAtomics: + case Local4ReductionAtomics: + case GlobalWGReduction: + case Global4WGReduction: + reductionElementCount = + _inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup; + for (i = 0; i < reductionElementCount; i++) { + calculatedValue += _output[i]; + } + flag = (calculatedValue == _cpuReductionSum); + break; + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + flag = (_output[0] == _cpuReductionSum); + break; + default: + CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)"); + return false; + } + if (!flag) printf("WRONG VALUES!!!!!"); + return flag; +} + +unsigned int OCLPerfAtomicSpeed::close() { + size_t i = 0; + for (; i < _kernels.size(); i++) { + error_ = _wrapper->clReleaseKernel(_kernels[i]); + } + for (; i < _programs.size(); i++) { + error_ = _wrapper->clReleaseProgram(_programs[i]); + } + if (_inputBuffer) { + error_ = clReleaseMemObject(_inputBuffer); + CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )"); + } + if (_outputBuffer) { + error_ = clReleaseMemObject(_outputBuffer); + CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)"); + } + + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, 
"clReleaseContext failed"); + } + + // Free host memory. + free(_input); + free(_output); + + // Reset everything. + _kernels.clear(); + _programs.clear(); + _inputBuffer = NULL; + _outputBuffer = NULL; + cmd_queue_ = NULL; + context_ = NULL; + _input = NULL; + _output = NULL; + + return _crcword; +} + +/* Helper functions */ +void OCLPerfAtomicSpeed::calculateHostBin() { + // compute CPU histogram + cl_int *p = (cl_int *)_input; + memset(_cpuhist, 0, NBINS * sizeof(cl_uint)); + _cpuReductionSum = 0; + + for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++) { + _cpuhist[(p[i] >> 24) & 0xff]++; + _cpuhist[(p[i] >> 16) & 0xff]++; + _cpuhist[(p[i] >> 8) & 0xff]++; + _cpuhist[(p[i] >> 0) & 0xff]++; + _cpuReductionSum += ((p[i] >> 24) & 0x3) + ((p[i] >> 16) & 0x3) + + ((p[i] >> 8) & 0x3) + ((p[i] >> 0) & 0x3); + } +} + +void OCLPerfAtomicSpeed::setupHistogram() { + cl_int status = 0; + + _nThreads = 64 * 1024; +#if defined(_WIN32) && !defined(_WIN64) + _n4Vectors = 1024 * 1024; +#else + _n4Vectors = 2048 * 2048; +#endif + _n4Vectors *= _nCurrentInputScale; + _n4VectorsPerThread = _n4Vectors / _nThreads; + _inputNBytes = _n4Vectors * sizeof(cl_uint4); + + _input = (cl_uint *)malloc(_inputNBytes); + if (0 == _input) { + _dataSizeTooBig = true; + return; + } + + // random initialization of input + time_t ltime; + time(<ime); + cl_uint a = (cl_uint)ltime, b = (cl_uint)ltime; + cl_uint *p = (cl_uint *)_input; + + for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++) + p[i] = (b = (a * (b & 65535)) + (b >> 16)); +} + +// Print the results of the current test. +void OCLPerfAtomicSpeed::PrintResults(const AtomicType atomicType, + double totalTime) { + char buf[500]; + char sAtomicType[100]; + double inputInGB = (double)_inputNBytes * (double)(1e-09); + // each cl_uint in _inputNBytes contributes 4 items. 
+ double totalHistogramDataInGB = (double)inputInGB * 4; + double perf = totalTime / _numLoops; + + switch (atomicType) { + case LocalHistogram: + SNPRINTF(sAtomicType, sizeof(sAtomicType), "Local histogram"); + break; + case GlobalHistogram: + SNPRINTF(sAtomicType, sizeof(sAtomicType), "Global histogram"); + break; + case Global4Histogram: + SNPRINTF(sAtomicType, sizeof(sAtomicType), "Global vec 4 histogram"); + break; + case LocalReductionNoAtomics: + SNPRINTF(sAtomicType, sizeof(sAtomicType), "Local reduction NO atomics"); + break; + case Local4ReductionNoAtomics: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Local vec 4 reduction NO atomics"); + break; + case LocalReductionAtomics: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Local reduction with atomics"); + break; + case Local4ReductionAtomics: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Local vec 4 reduction with atomics"); + break; + case GlobalWGReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), "Global work-group reduction"); + break; + case Global4WGReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Global vec 4 work-group reduction"); + break; + case GlobalAllToZeroReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Global all to zero reduction"); + break; + case Global4AllToZeroReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Global vec 4 all to zero reduction"); + break; + default: + CHECK_RESULT(true, "Atomic type not supported (PrintResults)"); + } + + SNPRINTF(buf, sizeof(buf), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s", + sAtomicType, totalHistogramDataInGB, perf); + _perfInfo = (float)(totalHistogramDataInGB / perf); + testDescString = buf; +} + +bool OCLPerfAtomicSpeed::IsReduction(const AtomicType atomicType) { + return ((atomicType >= LocalReductionNoAtomics) && + (atomicType <= GlobalAllToZeroReduction)); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h new file mode 100644 index 0000000000..1a94512866 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_AtomicSpeed_H_ +#define _OCL_AtomicSpeed_H_ + +#include +#include +#include +#include + +#include "OCLTestImp.h" + +#define DEFAULT_WG_SIZE 256 +#define NBINS 256 +#define BITS_PER_PIX 8 +#define NBANKS 16 + +// Define the atomic type to test. 
+enum AtomicType { + LocalHistogram = 0, + GlobalHistogram, + Global4Histogram, + LocalReductionNoAtomics, + Local4ReductionNoAtomics, + LocalReductionAtomics, + Local4ReductionAtomics, + GlobalWGReduction, + Global4WGReduction, + GlobalAllToZeroReduction, + Global4AllToZeroReduction, +}; + +typedef struct { + AtomicType atomicType; + int inputScale; +} testOCLPerfAtomicSpeedStruct; + +// Define the OCLPerfAtomicSpeed class. +class OCLPerfAtomicSpeed : public OCLTestImp { + public: + OCLPerfAtomicSpeed(); + virtual ~OCLPerfAtomicSpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + cl_context context_; + cl_command_queue cmd_queue_; + std::vector _programs; + std::vector _kernels; + cl_device_id device; + + bool _atomicsSupported; + bool _dataSizeTooBig; + cl_uint _numLoops; + + // Histogram related stuff... + private: + cl_ulong _maxMemoryAllocationSize; + cl_uint _inputNBytes; + cl_uint _outputNBytes; + + cl_uint _nCurrentInputScale; + cl_uint _workgroupSize; + // cl_uint nLoops; + cl_uint _nThreads; + cl_uint _nThreadsPerGroup; + cl_uint _nGroups; + cl_uint _n4Vectors; + cl_uint _n4VectorsPerThread; + cl_uint _nBins; + cl_uint _nBytesLDSPerGrp; + + cl_uint* _input; + cl_uint* _output; + cl_mem _inputBuffer; + cl_mem _outputBuffer; + + cl_uint _cpuhist[NBINS]; + cl_uint _cpuReductionSum; + + void calculateHostBin(); + void setupHistogram(); + bool VerifyResults(const AtomicType atomicType); + void ResetGlobalOutput(); + + // Methods that does the actual NDRange. 
+ void RunLocalHistogram(); + void RunLocalReduction(const AtomicType atomicType); + void RunGlobalHistogram(const AtomicType atomicType); + + void CreateKernels(const AtomicType atomicType); + bool IsReduction(const AtomicType atomicType); + void SetKernelArguments(const AtomicType atomicType); + void PrintResults(const AtomicType atomicType, double totalTime); +}; + +#endif // _OCL_AtomicSpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp new file mode 100644 index 0000000000..cf7716dfe8 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.cpp @@ -0,0 +1,509 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfAtomicSpeed20.h" + +#include +#include +#include +#include +#include + +#include "CL/cl.h" +#include "OCLPerfAtomicSpeed20Kernels.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +// Define the test suite tests. +testOCLPerfAtomicSpeed20Struct testOCLPerfAtomicSpeed20List[] = { + {GlobalWGReduction, 1}, {GlobalWGReduction, 2}, + {GlobalWGReduction, 4}, {GlobalAllToZeroReduction, 1}, + {GlobalAllToZeroReduction, 2}, {GlobalAllToZeroReduction, 4}, + {Global4WGReduction, 1}, {Global4WGReduction, 2}, + {Global4WGReduction, 4}, {Global4AllToZeroReduction, 1}, + {Global4AllToZeroReduction, 2}, {Global4AllToZeroReduction, 4}, +}; + +/////////////////////////////////////////////////////////////////////////////// +// OCLPerfAtomicSpeed20 implementation. +/////////////////////////////////////////////////////////////////////////////// +OCLPerfAtomicSpeed20::OCLPerfAtomicSpeed20() { + _atomicsSupported = false; + _dataSizeTooBig = false; + _numSubTests = sizeof(testOCLPerfAtomicSpeed20List) / + sizeof(testOCLPerfAtomicSpeed20Struct); + _numLoops = 10; + _nCurrentInputScale = 1; + _maxMemoryAllocationSize = 0; + + _input = NULL; + _output = NULL; + _inputBuffer = NULL; + _outputBuffer = NULL; + + skip_ = false; + + _workgroupSize = 256; + _programs.clear(); + _kernels.clear(); +} + +OCLPerfAtomicSpeed20::~OCLPerfAtomicSpeed20() {} + +void OCLPerfAtomicSpeed20::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + +#if defined(CL_VERSION_2_0) + cl_device_id device; + cl_int status = CL_SUCCESS; + + conversion = 1.0f; + _openTest = test; + _cpuReductionSum = 0; + _nCurrentInputScale = testOCLPerfAtomicSpeed20List[_openTest].inputScale; + AtomicType atomicType = 
testOCLPerfAtomicSpeed20List[_openTest].atomicType; + + // Setup stuff... + setupHistogram(); + calculateHostBin(); + + device = devices_[_deviceId]; + + cmd_queue_ = cmdQueues_[_deviceId]; + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + // Global memory size + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(cl_ulong), + &_maxMemoryAllocationSize, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, + "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed"); + + // Check that the test size is not too big for the current GPU. + _dataSizeTooBig = false; + cl_ulong tenMB = 1024 * 10240; + if (_inputNBytes >= (_maxMemoryAllocationSize - tenMB)) { + _dataSizeTooBig = true; + return; + } + + char *p = strstr(charbuf, "cl_khr_global_int32_base_atomics"); + + _atomicsSupported = false; + if (p) _atomicsSupported = true; + + // Verify atomics are supported. + if (!_atomicsSupported) return; + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + // Create buffers... + _inputBuffer = + clCreateBuffer(context_, CL_MEM_READ_ONLY, _inputNBytes, 0, &status); + CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)"); + + // Create the programs/kernels for the current test type. + CreateKernels(atomicType); + + _nThreadsPerGroup = _workgroupSize; + _nGroups = _nThreads / _nThreadsPerGroup; + _outputNBytes = _inputNBytes; + + _output = (cl_uint *)malloc(_outputNBytes); + if (0 == _output) { + _dataSizeTooBig = true; + return; + } + + // Create output Buffer + _outputBuffer = + clCreateBuffer(context_, CL_MEM_READ_WRITE, _outputNBytes, 0, &status); + CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)"); +#else + skip_ = true; + testDescString = "OpenCL verion < 2.0. 
Test Skipped."; + return; +#endif +} + +// Create the programs/kernels for the current test type. +void OCLPerfAtomicSpeed20::CreateKernels(const AtomicType atomicType) { + char log[16384]; + cl_kernel kernel_; + cl_program program_; + char buildOptions[1000]; + cl_int status = CL_SUCCESS; + cl_device_id device = devices_[_deviceId]; + + SNPRINTF(buildOptions, sizeof(buildOptions), + "-cl-std=CL2.0 -D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS, + BITS_PER_PIX, NBANKS); + + // Create the programs. + switch (atomicType) { + case GlobalWGReduction: + case Global4WGReduction: + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&global_atomics_sum_reduction_workgroup, + NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + _programs.push_back(program_); + break; + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero, + NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + _programs.push_back(program_); + break; + default: + CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)"); + } + // Build the programs. 
+ for (size_t i = 0; i < _programs.size(); i++) { + error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions, + NULL, NULL); + if (error_ != CL_SUCCESS) { + status = _wrapper->clGetProgramBuildInfo(_programs[i], device, + CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + } + + switch (atomicType) { + case GlobalWGReduction: + case Global4WGReduction: + kernel_ = _wrapper->clCreateKernel( + _programs[0], "global_atomics_sum_reduction_workgroup", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + _kernels.push_back(kernel_); + break; + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + kernel_ = _wrapper->clCreateKernel( + _programs[0], "global_atomics_sum_reduction_all_to_zero", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + _kernels.push_back(kernel_); + break; + default: + CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)"); + } +} + +// Sets the kernel arguments based on the current test type. +void OCLPerfAtomicSpeed20::SetKernelArguments(const AtomicType atomicType) { + int Arg = 0; + int localSize = 0; + int itemsPerThread = 1; + cl_int status = CL_SUCCESS; + + switch (atomicType) { + case GlobalWGReduction: + case Global4WGReduction: + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + // Set arguments for the global atomics histogram kernel + if ((Global4WGReduction == atomicType) || + (Global4AllToZeroReduction == atomicType)) + itemsPerThread = 4; + + status = _wrapper->clSetKernelArg( + _kernels[0], Arg++, sizeof(itemsPerThread), (void *)&itemsPerThread); + CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)"); + + status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_inputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. 
(inputBuffer)"); + + status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem), + (void *)&_outputBuffer); + CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)"); + break; + default: + CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)"); + } +} + +// Since we write multiple times to the output in global atomics, need to +// reset the content every time. +void OCLPerfAtomicSpeed20::ResetGlobalOutput() { + cl_int status; + + memset(_output, 0, _outputNBytes); + + status = + _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0, + _outputNBytes, _output, 0, NULL, NULL); + CHECK_RESULT(status, "clEnqueueWriteBuffer failed."); + + status = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); +} + +// Run the global histogram kernel. +void OCLPerfAtomicSpeed20::RunGlobalHistogram(AtomicType atomicType) { + cl_uint status; + size_t globalThreads[3] = {1}; + size_t localThreads[3] = {1}; + + globalThreads[0] = _inputNBytes / sizeof(cl_uint); + localThreads[0] = _nThreadsPerGroup; + + if ((Global4WGReduction == atomicType) || + (Global4AllToZeroReduction == atomicType)) + globalThreads[0] /= 4; + + status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL, + globalThreads, localThreads, 0, + NULL, NULL); + CHECK_RESULT(status, "clEnqueueNDRangeKernel failed."); + + status = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); +} + +// Run the AtomicSpeed logic. +void OCLPerfAtomicSpeed20::run() { + if (skip_) { + return; + } + +#if defined(CL_VERSION_2_0) + int Arg = 0; + cl_uint status; + AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType; + + // Verify atomics are supported. + if ((!_atomicsSupported) || (_dataSizeTooBig)) return; + + // Write data to the GPU + status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0, + _inputNBytes, _input, 0, NULL, NULL); + CHECK_RESULT(status, "clEnqueueWriteBuffer failed. 
(inputBuffer)"); + + status = _wrapper->clFlush(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); + + // Set the current arguments based on the test type. + SetKernelArguments(atomicType); + + // Run the kernels. + CPerfCounter timer; + double totalTime = 0.0f; + + for (unsigned int k = 0; k < _numLoops + 1; k++) { + // Since we run multiple times using global atomics the output + // would get accumulated therefore first clean it. + ResetGlobalOutput(); + + timer.Reset(); + timer.Start(); + switch (atomicType) { + case GlobalWGReduction: + case Global4WGReduction: + case GlobalAllToZeroReduction: + case Global4AllToZeroReduction: + RunGlobalHistogram(atomicType); + break; + default: + CHECK_RESULT(true, "Atomic type not supported"); + } + timer.Stop(); + // Don't count the warm-up + if (0 != k) totalTime += timer.GetElapsedTime(); + } + + status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0, + _outputNBytes, _output, 0, NULL, NULL); + CHECK_RESULT(status, "clEnqueueReadBuffer failed."); + status = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(status, "clFlush failed."); + + // Print the results. + PrintResults(atomicType, totalTime); + + // Check the results for the current test. 
+  _errorFlag = !(VerifyResults(atomicType));
+#endif
+}
+
+// Compare the results and see if they match
+// For the work-group reductions the per-group partial sums in _output are
+// added up on the host; for the all-to-zero reductions _output[0] already
+// holds the total. Either value is compared with _cpuReductionSum.
+// Returns true on a match, false on a mismatch or an unsupported type.
+bool OCLPerfAtomicSpeed20::VerifyResults(const AtomicType atomicType) {
+  cl_uint i = 0;
+  bool flag = true;
+  cl_uint calculatedValue = 0;
+  cl_uint reductionElementCount = 0;
+  switch (atomicType) {
+    case GlobalWGReduction:
+    case Global4WGReduction:
+      reductionElementCount =
+          _inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
+      for (i = 0; i < reductionElementCount; i++) {
+        calculatedValue += _output[i];
+      }
+      flag = (calculatedValue == _cpuReductionSum);
+      break;
+    case GlobalAllToZeroReduction:
+    case Global4AllToZeroReduction:
+      flag = (_output[0] == _cpuReductionSum);
+      break;
+    default:
+      CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
+      return false;
+  }
+  if (!flag) printf("WRONG VALUES!!!!!");
+  return flag;
+}
+
+// Releases the kernels, programs, device buffers and host arrays owned by
+// this test, resets the members, and forwards to OCLTestImp::close().
+unsigned int OCLPerfAtomicSpeed20::close() {
+  // Each loop needs its own index: sharing one counter made the second loop
+  // start at _kernels.size(), so programs were not released.
+  for (size_t k = 0; k < _kernels.size(); k++) {
+    error_ = _wrapper->clReleaseKernel(_kernels[k]);
+  }
+  for (size_t p = 0; p < _programs.size(); p++) {
+    error_ = _wrapper->clReleaseProgram(_programs[p]);
+  }
+
+  if (_inputBuffer) {
+    error_ = clReleaseMemObject(_inputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
+  }
+  if (_outputBuffer) {
+    error_ = clReleaseMemObject(_outputBuffer);
+    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
+  }
+
+  // Free host memory.
+  free(_input);
+  free(_output);
+
+  // Reset everything.
+  _kernels.clear();
+  _programs.clear();
+
+  _inputBuffer = NULL;
+  _outputBuffer = NULL;
+
+  _input = NULL;
+  _output = NULL;
+
+  return OCLTestImp::close();
+}
+
+/* Helper functions */
+// Builds the host reference data from _input: a 256-bin histogram of every
+// byte (_cpuhist) and the sum of the low two bits of every byte
+// (_cpuReductionSum).
+void OCLPerfAtomicSpeed20::calculateHostBin() {
+  // compute CPU histogram
+  cl_int *p = (cl_int *)_input;
+  memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
+  _cpuReductionSum = 0;
+
+  for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++) {
+    _cpuhist[(p[i] >> 24) & 0xff]++;
+    _cpuhist[(p[i] >> 16) & 0xff]++;
+    _cpuhist[(p[i] >> 8) & 0xff]++;
+    _cpuhist[(p[i] >> 0) & 0xff]++;
+    _cpuReductionSum += ((p[i] >> 24) & 0x3) + ((p[i] >> 16) & 0x3) +
+                        ((p[i] >> 8) & 0x3) + ((p[i] >> 0) & 0x3);
+  }
+}
+
+// Sizes the problem from _nCurrentInputScale, allocates the host input array
+// and fills it with pseudo-random words seeded from the current time.
+// Sets _dataSizeTooBig and returns early when the allocation fails.
+void OCLPerfAtomicSpeed20::setupHistogram() {
+  cl_int status = 0;
+
+  _nThreads = 64 * 1024;
+  _n4Vectors = 2048 * 2048;
+  _n4Vectors *= _nCurrentInputScale;
+  _n4VectorsPerThread = _n4Vectors / _nThreads;
+  _inputNBytes = _n4Vectors * sizeof(cl_uint4);
+
+  _input = (cl_uint *)malloc(_inputNBytes);
+  if (0 == _input) {
+    _dataSizeTooBig = true;
+    return;
+  }
+
+  // random initialization of input
+  time_t ltime;
+  time(&ltime);
+  cl_uint a = (cl_uint)ltime, b = (cl_uint)ltime;
+  cl_uint *p = (cl_uint *)_input;
+
+  for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++)
+    p[i] = (b = (a * (b & 65535)) + (b >> 16));
+}
+
+// Print the results of the current test.
+void OCLPerfAtomicSpeed20::PrintResults(const AtomicType atomicType,
+                                        double totalTime) {
+  char buf[500];
+  char sAtomicType[100];
+  double inputInGB = (double)_inputNBytes * (double)(1e-09);
+  // each cl_uint in _inputNBytes contributes 4 items.
+ double totalHistogramDataInGB = (double)inputInGB * 4; + double perf = totalTime / _numLoops; + + switch (atomicType) { + case GlobalWGReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), "Global work-group reduction"); + break; + case Global4WGReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Global vec 4 work-group reduction"); + break; + case GlobalAllToZeroReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Global all to zero reduction"); + break; + case Global4AllToZeroReduction: + SNPRINTF(sAtomicType, sizeof(sAtomicType), + "Global vec 4 all to zero reduction"); + break; + default: + CHECK_RESULT(true, "Atomic type not supported (PrintResults)"); + } + + SNPRINTF(buf, sizeof(buf), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s", + sAtomicType, totalHistogramDataInGB, perf); + _perfInfo = (float)(totalHistogramDataInGB / perf); + testDescString = buf; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h new file mode 100644 index 0000000000..b3c39da048 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_AtomicSpeed20_H_ +#define _OCL_AtomicSpeed20_H_ + +#include +#include +#include +#include + +#include "OCLTestImp.h" + +#define DEFAULT_WG_SIZE 256 +#define NBINS 256 +#define BITS_PER_PIX 8 +#define NBANKS 16 + +#include "OCLPerfAtomicSpeed.h" + +typedef struct { + AtomicType atomicType; + int inputScale; +} testOCLPerfAtomicSpeed20Struct; + +// Define the OCLPerfAtomicSpeed20 class. +class OCLPerfAtomicSpeed20 : public OCLTestImp { + public: + OCLPerfAtomicSpeed20(); + virtual ~OCLPerfAtomicSpeed20(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + cl_command_queue cmd_queue_; + std::vector _programs; + std::vector _kernels; + + bool _atomicsSupported; + bool _dataSizeTooBig; + cl_uint _numLoops; + + // Histogram related stuff... 
+ private: + cl_ulong _maxMemoryAllocationSize; + cl_uint _inputNBytes; + cl_uint _outputNBytes; + + cl_uint _nCurrentInputScale; + cl_uint _workgroupSize; + // cl_uint nLoops; + cl_uint _nThreads; + cl_uint _nThreadsPerGroup; + cl_uint _nGroups; + cl_uint _n4Vectors; + cl_uint _n4VectorsPerThread; + cl_uint _nBins; + cl_uint _nBytesLDSPerGrp; + + cl_uint* _input; + cl_uint* _output; + cl_mem _inputBuffer; + cl_mem _outputBuffer; + bool skip_; + + cl_uint _cpuhist[NBINS]; + cl_uint _cpuReductionSum; + + void calculateHostBin(); + void setupHistogram(); + bool VerifyResults(const AtomicType atomicType); + void ResetGlobalOutput(); + + // Methods that does the actual NDRange. + void RunGlobalHistogram(const AtomicType atomicType); + + void CreateKernels(const AtomicType atomicType); + void SetKernelArguments(const AtomicType atomicType); + void PrintResults(const AtomicType atomicType, double totalTime); +}; + +#endif // _OCL_AtomicSpeed20_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h new file mode 100644 index 0000000000..e3697c4f9a --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeed20Kernels.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +static const char *global_atomics_sum_reduction_all_to_zero = + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + " __kernel void global_atomics_sum_reduction_all_to_zero(uint " + "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n" + "{\n" + " uint sum = 0;\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " \n" + " uint tid = get_global_id(0);\n" + " uint Stride = get_global_size(0);\n" + " for( int i = 0; i < ItemsPerThread; i++)\n" + " {\n" + " uint data = Input[tid];\n" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " tid += Stride;\n" + " }\n" + " atomic_fetch_add_explicit( &(Output[0]), sum, memory_order_relaxed, " + "memory_scope_device);\n" + "}\n"; + +static const char *global_atomics_sum_reduction_workgroup = + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + " __kernel void global_atomics_sum_reduction_workgroup(uint " + "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n" + "{\n" + " uint sum = 0;\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " \n" + " uint tid = get_global_id(0);\n" + " uint Stride = get_global_size(0);\n" + " for( int i = 0; i < ItemsPerThread; i++)\n" + " {\n" + " uint data = Input[tid];\n" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + 
" sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " tid += Stride;\n" + " }\n" + " atomic_fetch_add_explicit( &(Output[get_group_id(0)]), sum, " + "memory_order_relaxed, memory_scope_device);\n" + "}\n"; diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h new file mode 100644 index 0000000000..defbff4e8f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfAtomicSpeedKernels.h @@ -0,0 +1,402 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +static const char *local_atomics_histogram = + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#define MIN(a,b) ((a) < (b)) ? (a) : (b) \n" + "#define MAX(a,b) ((a) > (b)) ? 
(a) : (b) \n" + "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n" + "void local_atomics_histogram(__global uint4 *Image,\n" + "__global uint *Histogram,\n" + "uint n4VectorsPerThread)\n" + "{\n" + " __local __attribute__((aligned(16))) uint subhists[NBANKS * NBINS];\n" + "\n" + " uint tid = get_global_id(0);\n" + " uint ltid = get_local_id(0);\n" + " uint Stride = get_global_size(0);\n" + "\n" + " uint i, idx;\n" + " uint4 temp, temp2;\n" + " const uint shft = (uint) BITS_PER_PIX;\n" + " const uint msk = (uint) (NBINS-1);\n" + " uint offset = (uint) ltid % (uint) (NBANKS);\n" + "\n" + " uint lmem_items = NBANKS * NBINS;\n" + " uint lmem_items_per_thread;\n" + " uint lmem_max_threads;\n" + "\n" + " // parallel LDS clear\n" + " // first, calculate threads per item, at least 1:\n" + " lmem_max_threads = MIN( 1, get_local_size(0) / lmem_items );\n" + " // but no more than we have items:\n" + " lmem_max_threads = MAX( 1, lmem_max_threads / lmem_items );\n" + " // calculate threads total:\n" + " lmem_max_threads = lmem_items / lmem_max_threads;\n" + " // but no more than LDS banks:\n" + " lmem_max_threads = MIN( get_local_size(0), lmem_max_threads );\n" + "\n" + " lmem_items_per_thread = lmem_items / lmem_max_threads;\n" + "\n" + " // now, clear LDS\n" + " __local uint4 *p = (__local uint4 *) subhists;\n" + "\n" + " if( ltid < lmem_max_threads )\n" + " {\n" + " for(i=0, idx=ltid; i> shft;\n" + " temp2 = (temp & msk) * (uint4) NBANKS + offset;\n" + "\n" + " (void) atom_inc( subhists + temp2.x );\n" + " (void) atom_inc( subhists + temp2.y );\n" + " (void) atom_inc( subhists + temp2.z );\n" + " (void) atom_inc( subhists + temp2.w );\n" + "\n" + " temp = temp >> shft;\n" + " temp2 = (temp & msk) * (uint4) NBANKS + offset;\n" + "\n" + " (void) atom_inc( subhists + temp2.x );\n" + " (void) atom_inc( subhists + temp2.y );\n" + " (void) atom_inc( subhists + temp2.z );\n" + " (void) atom_inc( subhists + temp2.w );\n" + "\n" + " temp = temp >> shft;\n" + " temp2 = (temp 
& msk) * (uint4) NBANKS + offset;\n" + "\n" + " (void) atom_inc( subhists + temp2.x );\n" + " (void) atom_inc( subhists + temp2.y );\n" + " (void) atom_inc( subhists + temp2.z );\n" + " (void) atom_inc( subhists + temp2.w );\n" + " }\n" + "\n" + " barrier( CLK_LOCAL_MEM_FENCE );\n" + "\n" + " // reduce __local banks to single histogram per work-group\n" + "\n" + " if( ltid < NBINS )\n" + " {\n" + " uint bin = 0;\n" + " for( i=0; i> shft;\n" + " atom_inc( &(Histogram[ (temp & msk) ]) );\n" + " temp = temp >> shft;\n" + " atom_inc( &(Histogram[ (temp & msk) ]) );\n" + " temp = temp >> shft;\n" + " atom_inc( &(Histogram[ (temp & msk) ]) );\n" + " tid += Stride;" + " }\n" + "}\n"; + +static const char *global_vec4_atomics_histogram = + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n" + "void global_atomics_histogram(uint ItemsPerThread,\n" + "__global uint4 *Input,\n" + "__global uint *Histogram)\n" + "{\n" + " uint tid = get_global_id(0);\n" + " const uint shft = (uint) BITS_PER_PIX;\n" + " const uint msk = (uint) (NBINS-1);\n" + " uint Stride = get_global_size(0);\n" + " for( int i = 0; i < ItemsPerThread; i++)\n" + " {\n" + " uint4 temp = Input[tid];\n" + " atom_inc( &(Histogram[ (temp.x & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.y & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.z & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.w & msk) ]) );\n" + " temp = temp >> shft;\n" + " atom_inc( &(Histogram[ (temp.x & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.y & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.z & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.w & msk) ]) );\n" + " temp = temp >> shft;\n" + " atom_inc( &(Histogram[ (temp.x & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.y & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.z & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.w & msk) ]) );\n" + " temp = temp >> shft;\n" + " atom_inc( &(Histogram[ (temp.x & msk) ]) 
);\n" + " atom_inc( &(Histogram[ (temp.y & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.z & msk) ]) );\n" + " atom_inc( &(Histogram[ (temp.w & msk) ]) );\n" + " tid += Stride;" + " }\n" + "}\n"; + +static const char *global_atomics_sum_reduction_all_to_zero = + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + " __kernel void global_atomics_sum_reduction_all_to_zero(uint " + "ItemsPerThread, __global uint *Input, __global int *Output )\n" + "{\n" + " uint sum = 0;\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " \n" + " uint tid = get_global_id(0);\n" + " uint Stride = get_global_size(0);\n" + " for( int i = 0; i < ItemsPerThread; i++)\n" + " {\n" + " uint data = Input[tid];\n" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " tid += Stride;\n" + " }\n" + " atom_add( &(Output[0]), sum);\n" + "}\n"; + +static const char *global_atomics_sum_reduction_workgroup = + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + " __kernel void global_atomics_sum_reduction_workgroup(uint " + "ItemsPerThread, __global uint *Input, __global int *Output )\n" + "{\n" + " uint sum = 0;\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " \n" + " uint tid = get_global_id(0);\n" + " uint Stride = get_global_size(0);\n" + " for( int i = 0; i < ItemsPerThread; i++)\n" + " {\n" + " uint data = Input[tid];\n" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " data = data >> shft;" + " sum += data & msk;\n" + " tid += Stride;\n" + " }\n" + " atom_add( &(Output[get_group_id(0)]), sum);\n" + "}\n"; + +static const char *local_reduction = + "__kernel void local_reduction(__global uint* input, __global uint* " + "output, __local uint* sdata)\n" + "{\n" + " // load shared mem\n" + " const 
uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " unsigned int tid = get_local_id(0);\n" + "\n" + " unsigned int localSize = get_local_size(0);\n" + " unsigned int stride = get_global_id(0) * 2;\n" + " unsigned int data1 = input[stride];\n" + " unsigned int data2 = input[stride + 1];\n" + " unsigned int sum = 0;\n" + " for( int i = 0; i < 4; i++)\n" + " {\n" + " sum += (data1 & msk) + (data2 & msk);\n" + " data1 = data1 >> shft;\n" + " data2 = data2 >> shft;\n" + " }\n" + " sdata[tid] = sum;" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " // do reduction in shared mem\n" + " for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n" + " {\n" + " if(tid < s) \n" + " {\n" + " sdata[tid] += sdata[tid + s];\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " }\n" + "\n" + " // write result for this block to global mem\n" + " if(tid == 0) output[get_group_id(0)] = sdata[0];\n" + "}\n"; + +static const char *local_vec4_reduction = + "__kernel void local_reduction(__global uint4* input, __global uint4* " + "output, __local uint4* sdata)\n" + "{\n" + " // load shared mem\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " unsigned int tid = get_local_id(0);\n" + "\n" + " unsigned int localSize = get_local_size(0);\n" + " unsigned int stride = get_global_id(0) * 2;\n" + " uint4 data1 = input[stride];\n" + " uint4 data2 = input[stride + 1];\n" + " uint4 sum = 0;\n" + " for( int i = 0; i < 4; i++)\n" + " {\n" + " sum += (data1 & msk) + (data2 & msk);\n" + " data1 = data1 >> shft;\n" + " data2 = data2 >> shft;\n" + " }\n" + " sdata[tid] = sum;" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " // do reduction in shared mem\n" + " for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n" + " {\n" + " if(tid < s) \n" + " {\n" + " sdata[tid] += sdata[tid + s];\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " }\n" + "\n" + " // write result for this block to global mem\n" + " if(tid == 0) output[get_group_id(0)] = sdata[0];\n" + "}\n"; + +static 
const char *local_atomics_reduction = + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "__kernel void local_reduction(__global uint* input, __global uint* " + "output, __local uint* sdata)\n" + "{\n" + " // load shared mem\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " unsigned int tid = get_local_id(0);\n" + "\n" + " unsigned int localSize = get_local_size(0);\n" + " unsigned int stride = get_global_id(0) * 2;\n" + " unsigned int data1 = input[stride];\n" + " unsigned int data2 = input[stride + 1];\n" + " unsigned int sum = 0;\n" + " for( int i = 0; i < 4; i++)\n" + " {\n" + " sum += (data1 & msk) + (data2 & msk);\n" + " data1 = data1 >> shft;\n" + " data2 = data2 >> shft;\n" + " }\n" + " sdata[tid] = sum;" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " // do reduction in shared mem\n" + " for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n" + " {\n" + " if(tid < s) \n" + " {\n" + " atom_add( &(sdata[tid]), sdata[tid + s]);\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " }\n" + "\n" + " // write result for this block to global mem\n" + " if(tid == 0) output[get_group_id(0)] = sdata[0];\n" + "}\n"; + +static const char *local_vec4_atomics_reduction = + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "__kernel void local_reduction(__global uint4* input, __global uint4* " + "output, __local uint4* sdata)\n" + "{\n" + " // load shared mem\n" + " const uint msk = (uint)3;\n" + " const uint shft = (uint)8;\n" + " unsigned int tid = get_local_id(0);\n" + "\n" + " unsigned int localSize = get_local_size(0);\n" + " unsigned int stride = get_global_id(0) * 2;\n" + " uint4 data1 = input[stride];\n" + " uint4 data2 = input[stride + 1];\n" + " uint4 sum = 0;\n" + " for( int i = 0; i < 4; i++)\n" + " {\n" + " sum += (data1 & msk) + (data2 & msk);\n" + " data1 = data1 >> shft;\n" + " data2 = data2 >> shft;\n" + " }\n" + " sdata[tid] = sum;" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " // do 
reduction in shared mem\n" + " for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n" + " {\n" + " if(tid < s) \n" + " {\n" + " atom_add( &(sdata[tid]).x, sdata[tid + s].x);\n" + " atom_add( &(sdata[tid]).y, sdata[tid + s].y);\n" + " atom_add( &(sdata[tid]).z, sdata[tid + s].z);\n" + " atom_add( &(sdata[tid]).w, sdata[tid + s].w);\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " }\n" + "\n" + " // write result for this block to global mem\n" + " if(tid == 0) output[get_group_id(0)] = sdata[0];\n" + "}\n"; diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp new file mode 100644 index 0000000000..0cfb9de532 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.cpp @@ -0,0 +1,254 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfBufferCopyOverhead.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +typedef struct { + unsigned int iterations; + int flushEvery; +} testStruct; + +static testStruct testList[] = { + {1, -1}, {1, -1}, {10, 1}, {10, -1}, {100, 1}, + {100, 10}, {100, -1}, {1000, 1}, {1000, 10}, {1000, 100}, + {1000, -1}, {10000, 1}, {10000, 10}, {10000, 100}, {10000, 1000}, + {10000, -1}, {100000, 1}, {100000, 10}, {100000, 100}, {100000, 1000}, + {100000, 10000}, {100000, -1}, +}; + +OCLPerfBufferCopyOverhead::OCLPerfBufferCopyOverhead() { + _numSubTests = 2 * 2 * sizeof(testList) / sizeof(testStruct); +} + +OCLPerfBufferCopyOverhead::~OCLPerfBufferCopyOverhead() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfBufferCopyOverhead::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test % (sizeof(testList) / sizeof(testStruct)); + + context_ = 0; + cmd_queue_ = 0; + srcBuffer_ = 0; + dstBuffer_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of 
requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + delete platforms; + } + + bufSize_ = 4; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_READ_ONLY; + sleep = ((test / (sizeof(testList) / sizeof(testStruct))) % 2) > 0; + if (test >= ((sizeof(testList) / sizeof(testStruct)) * 2)) { + srcHost = true; + flags |= CL_MEM_ALLOC_HOST_PTR; + } else { + srcHost = false; + } + srcBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed"); + + flags = CL_MEM_WRITE_ONLY; + if (!srcHost) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } + dstBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_); + CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed"); +} + +void OCLPerfBufferCopyOverhead::run(void) { + CPerfCounter timer; + 
cl_event event; + cl_int eventStatus; + unsigned int iter = testList[_openTest].iterations; + + // Warm up + error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0, + 0, bufSize_, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueCopyBuffer failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < iter; i++) { + error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, + 0, 0, bufSize_, 0, NULL, &event); + + CHECK_RESULT(error_, "clEnqueueCopyBuffer failed"); + if ((testList[_openTest].flushEvery > 0) && + (((i + 1) % testList[_openTest].flushEvery) == 0)) { + if (sleep) { + _wrapper->clFinish(cmd_queue_); + } else { + _wrapper->clFlush(cmd_queue_); + error_ = + _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(cl_int), &eventStatus, NULL); + while (eventStatus > 0) { + error_ = + _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(cl_int), &eventStatus, NULL); + } + } + } + if (i != (iter - 1)) { + _wrapper->clReleaseEvent(event); + } + } + if (sleep) { + _wrapper->clFinish(cmd_queue_); + } else { + _wrapper->clFlush(cmd_queue_); + error_ = _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(cl_int), &eventStatus, NULL); + while (eventStatus > 0) { + error_ = + _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(cl_int), &eventStatus, NULL); + } + } + _wrapper->clReleaseEvent(event); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer copy time in us + double perf = sec * 1000. * 1000. 
/ iter; + + const char *strSrc = NULL; + const char *strDst = NULL; + const char *strWait = NULL; + if (srcHost) { + strSrc = "host"; + strDst = "dev"; + } else { + strSrc = "dev"; + strDst = "host"; + } + if (sleep) { + strWait = "sleep"; + } else { + strWait = "spin"; + } + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " %5s, s:%4s d:%4s i:%6d (us) ", strWait, strSrc, + strDst, iter); + testDescString = buf; +} + +unsigned int OCLPerfBufferCopyOverhead::close(void) { + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(dstBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h new file mode 100644 index 0000000000..983fdd51ef --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopyOverhead.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_BufferCopyOverhead_H_
+#define _OCL_BufferCopyOverhead_H_
+
+#include "OCLTestImp.h"
+// Buffer-copy overhead perf test (per its name); run() reports a "(us)" figure.
+class OCLPerfBufferCopyOverhead : public OCLTestImp {
+ public:
+  OCLPerfBufferCopyOverhead();
+  virtual ~OCLPerfBufferCopyOverhead();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);  // releases buffers, queue and context
+
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;
+  cl_mem dstBuffer_;
+  cl_int error_;  // result of the most recent OpenCL call
+
+  unsigned int bufSize_;
+  bool sleep;    // wait mode: run() labels it "sleep" when set, else "spin"
+  bool srcHost;  // run() labels the copy s:host d:dev when set, else s:dev d:host
+};
+
+#endif  // _OCL_BufferCopyOverhead_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp
new file mode 100644
index 0000000000..13256a39ba
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.cpp
@@ -0,0 +1,439 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfBufferCopySpeed.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 8 +// 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10 +static const unsigned int Sizes[NUM_SIZES] = { + 4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216 + 10}; + +static const unsigned int Iterations[2] = {1, OCLPerfBufferCopySpeed::NUM_ITER}; + +#define BUF_TYPES 4 +// 16 ways to combine 4 different buffer types +#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES) + +OCLPerfBufferCopySpeed::OCLPerfBufferCopySpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2; +} + +OCLPerfBufferCopySpeed::~OCLPerfBufferCopySpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfBufferCopySpeed::setData(void *ptr, unsigned int size, + unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + value = 0; + for (unsigned int i = 0; i < size >> 2; i++) { + ptr2[i] = value; + value++; + } +} + +void OCLPerfBufferCopySpeed::checkData(void *ptr, unsigned int size, + unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + value = 0; + for (unsigned int i = 0; i < size >> 2; i++) { + if (ptr2[i] != value) { + printf("Data validation failed at %d! 
Got 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, ptr2[i], ptr2[i + 1], ptr2[i + 2], ptr2[i + 3]); + printf("Expected 0x%08x 0x%08x 0x%08x 0x%08x\n", value, value, value, + value); + CHECK_RESULT(true, "Data validation failed!"); + break; + } + value++; + } +} + +void OCLPerfBufferCopySpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + srcBuffer_ = 0; + dstBuffer_ = 0; + persistent[0] = false; + persistent[1] = false; + allocHostPtr[0] = false; + allocHostPtr[1] = false; + useHostPtr[0] = false; + useHostPtr[1] = false; + memptr[0] = NULL; + memptr[1] = NULL; + alignedmemptr[0] = NULL; + alignedmemptr[1] = NULL; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if 
(!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + + char getVersion[128]; + error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION, + sizeof(getVersion), getVersion, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + platformVersion[0] = getVersion[7]; + platformVersion[1] = getVersion[8]; + platformVersion[2] = getVersion[9]; + platformVersion[3] = '\0'; + bufSize_ = Sizes[_openTest % NUM_SIZES]; + unsigned int srcTest = (_openTest / NUM_SIZES) % BUF_TYPES; + unsigned int dstTest = (_openTest / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES; + if (srcTest == 3) { + useHostPtr[0] = true; + } else if ((srcTest == 2) && isAMD) { + persistent[0] = true; + } else if (srcTest == 1) { + allocHostPtr[0] = true; + } + if ((dstTest == 1) && isAMD) { + persistent[1] = true; + } else if (dstTest == 2) { + allocHostPtr[1] = true; + } else if (dstTest == 3) { + useHostPtr[1] = true; + } + + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)]; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_READ_ONLY; + if (persistent[0]) { + flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; + } else if (allocHostPtr[0]) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr[0]) { + flags |= CL_MEM_USE_HOST_PTR; + memptr[0] = malloc(bufSize_ + 4096); + alignedmemptr[0] = (void *)(((size_t)memptr[0] + 4095) & ~4095); + } + srcBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_, + alignedmemptr[0], &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed"); + void *mem; + mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, srcBuffer_, CL_TRUE, + CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + setData(mem, bufSize_, 0x600df00d); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL); + + flags = CL_MEM_WRITE_ONLY; + if (persistent[1]) { + flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; + } else if (allocHostPtr[1]) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr[1]) { + flags |= CL_MEM_USE_HOST_PTR; + memptr[1] = malloc(bufSize_ + 4096); + alignedmemptr[1] = (void *)(((size_t)memptr[1] + 4095) & ~4095); + } + dstBuffer_ = _wrapper->clCreateBuffer(context_, flags, 
bufSize_, + alignedmemptr[1], &error_); + CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed"); + + // Force persistent memory to be on GPU + if (persistent[0]) { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, dstBuffer_, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } + if (persistent[1]) { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, memBuffer, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } +} + +void OCLPerfBufferCopySpeed::run(void) { + CPerfCounter timer; + + // Warm up + error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0, + 0, bufSize_, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueCopyBuffer failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, + 0, 0, bufSize_, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueCopyBuffer failed"); + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer copy bandwidth in GB/s + double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + void *mem; + mem = + _wrapper->clEnqueueMapBuffer(cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, + 0, bufSize_, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + checkData(mem, bufSize_, 0x600df00d); + 
_wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + + const char *strSrc = NULL; + const char *strDst = NULL; + if (persistent[0]) + strSrc = "per"; + else if (allocHostPtr[0]) + strSrc = "AHP"; + else if (useHostPtr[0]) + strSrc = "UHP"; + else + strSrc = "dev"; + if (persistent[1]) + strDst = "per"; + else if (allocHostPtr[1]) + strDst = "AHP"; + else if (useHostPtr[1]) + strDst = "UHP"; + else + strDst = "dev"; + // Double results when src and dst are both on device + if ((persistent[0] || (!allocHostPtr[0] && !useHostPtr[0])) && + (persistent[1] || (!allocHostPtr[1] && !useHostPtr[1]))) + perf *= 2.0; + // Double results when src and dst are both in sysmem + if ((allocHostPtr[0] || useHostPtr[0]) && (allocHostPtr[1] || useHostPtr[1])) + perf *= 2.0; + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_, + strSrc, strDst, numIter); + testDescString = buf; +} + +unsigned int OCLPerfBufferCopySpeed::close(void) { + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(dstBuffer_) failed"); + } + if (memptr[0]) { + free(memptr[0]); + } + if (memptr[1]) { + free(memptr[1]); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} + +void OCLPerfBufferCopyRectSpeed::run(void) { + CPerfCounter timer; + size_t width = static_cast(sqrt(static_cast(bufSize_))); + size_t srcOrigin[3] = {0, 0, 0}; + size_t dstOrigin[3] = {0, 0, 0}; + size_t region[3] = 
{width, width, 1}; + // Clamp iteration count for non-local writes to shorten test runtime + unsigned int testNumIter = numIter; + + if (allocHostPtr[1]) { + testNumIter = (numIter < 100 ? numIter : 100); + } + + // Skip for 1.0 platforms + if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) { + char buf[256]; + SNPRINTF(buf, sizeof(buf), " SKIPPED "); + testDescString = buf; + return; + } + // Warm up + error_ = _wrapper->clEnqueueCopyBufferRect(cmd_queue_, srcBuffer_, dstBuffer_, + srcOrigin, dstOrigin, region, + width, 0, width, 0, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < testNumIter; i++) { + error_ = _wrapper->clEnqueueCopyBufferRect( + cmd_queue_, srcBuffer_, dstBuffer_, srcOrigin, dstOrigin, region, width, + 0, width, 0, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed"); + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer copy bandwidth in GB/s + double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec; + + const char *strSrc = NULL; + const char *strDst = NULL; + if (persistent[0]) + strSrc = "per"; + else if (allocHostPtr[0]) + strSrc = "AHP"; + else if (useHostPtr[0]) + strSrc = "UHP"; + else + strSrc = "dev"; + if (persistent[1]) + strDst = "per"; + else if (allocHostPtr[1]) + strDst = "AHP"; + else if (useHostPtr[1]) + strDst = "UHP"; + else + strDst = "dev"; + // Double results when src and dst are both on device + if ((persistent[0] || (!allocHostPtr[0] && !useHostPtr[0])) && + (persistent[1] || (!allocHostPtr[1] && !useHostPtr[1]))) + perf *= 2.0; + // Double results when src and dst are both in sysmem + if ((allocHostPtr[0] || useHostPtr[0]) && (allocHostPtr[1] || useHostPtr[1])) + perf *= 2.0; + _perfInfo = 
(float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_, + strSrc, strDst, testNumIter); + testDescString = buf; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h new file mode 100644 index 0000000000..7599cecfbd --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferCopySpeed.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_BufferCopySpeed_H_
+#define _OCL_BufferCopySpeed_H_
+
+#include "OCLTestImp.h"
+// Perf test timing clEnqueueCopyBuffer between buffers of several types.
+class OCLPerfBufferCopySpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferCopySpeed();
+  virtual ~OCLPerfBufferCopySpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;
+  cl_mem dstBuffer_;
+  cl_int error_;  // result of the most recent OpenCL call
+
+  unsigned int bufSize_;     // bytes per buffer, chosen by open() from its size table
+  bool persistent[2];        // [0]=src, [1]=dst: CL_MEM_USE_PERSISTENT_MEM_AMD
+  bool allocHostPtr[2];      // [0]=src, [1]=dst: CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr[2];        // [0]=src, [1]=dst: CL_MEM_USE_HOST_PTR over memptr
+  unsigned int numIter;      // timed copies per run()
+  bool isAMD;                // vendor is "Advanced Micro Devices, Inc." with devices present
+  char platformVersion[32];  // characters 7..9 of CL_PLATFORM_VERSION, NUL-terminated
+  void setData(void* ptr, unsigned int size, unsigned int value);    // writes 0,1,2,...; value unused
+  void checkData(void* ptr, unsigned int size, unsigned int value);  // verifies 0,1,2,...; value unused
+  void* memptr[2];         // malloc'd backing store for USE_HOST_PTR, freed in close()
+  void* alignedmemptr[2];  // memptr rounded up to a 4096-byte boundary
+};
+// Same setup; run() times clEnqueueCopyBufferRect over a square region instead.
+class OCLPerfBufferCopyRectSpeed : public OCLPerfBufferCopySpeed {
+ public:
+  OCLPerfBufferCopyRectSpeed() : OCLPerfBufferCopySpeed() {}
+
+ public:
+  virtual void run(void);
+};
+#endif  // _OCL_BufferCopySpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp
new file mode 100644
index 0000000000..ca076d3c6e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.cpp
@@ -0,0 +1,334 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfBufferReadSpeed.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 8 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = { + 1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216}; + +static cl_uint blockedSubtests; + +static const unsigned int Iterations[2] = {1, OCLPerfBufferReadSpeed::NUM_ITER}; +#define NUM_OFFSETS 1 +static const unsigned int offsets[NUM_OFFSETS] = {0}; +#define NUM_SUBTESTS (3 + NUM_OFFSETS) +extern const char *blkStr[2]; + +OCLPerfBufferReadSpeed::OCLPerfBufferReadSpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2; + blockedSubtests = _numSubTests; + _numSubTests += NUM_SIZES * NUM_SUBTESTS; +} + +OCLPerfBufferReadSpeed::~OCLPerfBufferReadSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfBufferReadSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + persistent = false; + allocHostPtr = false; + useHostPtr = false; + hostMem = NULL; + alignedMem = NULL; + alignment = 4096; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = 
platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + char getVersion[128]; + error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION, + sizeof(getVersion), getVersion, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + platformVersion[0] = getVersion[7]; + platformVersion[1] = getVersion[8]; + platformVersion[2] = getVersion[9]; + platformVersion[3] = '\0'; + bufSize_ = Sizes[_openTest % NUM_SIZES]; + + if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) { + useHostPtr = true; + offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3]; + } else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) { + persistent = true; + } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) { + allocHostPtr = true; + } + + if (_openTest < blockedSubtests) { + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)]; + } else { + numIter = + 4 * OCLPerfBufferReadSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1); + } + + devices = (cl_device_id *)malloc(num_devices * 
sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + if (persistent) { + flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; + } else if (allocHostPtr) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset); + CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed"); + alignedMem = + (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) + + offset); + } + outBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + // Force memory to be on GPU if possible + { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } +} + +void OCLPerfBufferReadSpeed::run(void) { + CPerfCounter timer; + char *mem = new char[bufSize_]; + cl_bool blocking = (_openTest < blockedSubtests) ? 
CL_TRUE : CL_FALSE; + + // Warm up + error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0, + bufSize_, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBuffer failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, blocking, 0, + bufSize_, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBuffer failed"); + } + if (blocking != CL_TRUE) { + _wrapper->clFinish(cmd_queue_); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer read bandwidth in GB/s + double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char str[256]; + if (persistent) { + SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)"); + } else if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } else { + SNPRINTF(str, sizeof(str), "(GB/s)"); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_, + blkStr[blocking], numIter, str); + testDescString = buf; + + delete mem; +} + +unsigned int OCLPerfBufferReadSpeed::close(void) { + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (hostMem) { + free(hostMem); + } + + return _crcword; +} + +void OCLPerfBufferReadRectSpeed::run(void) { + CPerfCounter timer; + char *mem = new char[bufSize_]; + size_t width = static_cast(sqrt(static_cast(bufSize_))); + size_t bufOrigin[3] = {0, 0, 0}; + 
size_t hostOrigin[3] = {0, 0, 0}; + size_t region[3] = {width, width, 1}; + cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE; + + // Clamp iterations to reduce run time + unsigned int testNumIter; + testNumIter = (numIter < 100 ? numIter : 100); + + // Skip for 1.0 platforms + if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) { + char buf[256]; + SNPRINTF(buf, sizeof(buf), " SKIPPED "); + testDescString = buf; + return; + } + // Warm up + error_ = _wrapper->clEnqueueReadBufferRect( + cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0, + width, 0, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBufferRect failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < testNumIter; i++) { + error_ = _wrapper->clEnqueueReadBufferRect( + cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width, + 0, width, 0, mem, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueReadBufferRect failed"); + } + if (blocking != CL_TRUE) { + _wrapper->clFinish(cmd_queue_); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer read bandwidth in GB/s + double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char str[256]; + if (persistent) { + SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)"); + } else if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } else { + SNPRINTF(str, sizeof(str), "(GB/s)"); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_, + blkStr[blocking], numIter, str); + testDescString = buf; + + delete mem; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h new file mode 100644 index 0000000000..01df4a5815 --- /dev/null +++ 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferReadSpeed.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_BufferReadSpeed_H_
+#define _OCL_BufferReadSpeed_H_
+
+#include "OCLTestImp.h"
+// Perf test timing clEnqueueReadBuffer from buffers of several types.
+class OCLPerfBufferReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfBufferReadSpeed();
+  virtual ~OCLPerfBufferReadSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // buffer that run() reads back into host memory
+  cl_int error_;      // result of the most recent OpenCL call
+
+  unsigned int bufSize_;     // bytes per read, chosen by open() from its size table
+  bool persistent;           // buffer created with CL_MEM_USE_PERSISTENT_MEM_AMD
+  bool allocHostPtr;         // buffer created with CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;           // buffer created with CL_MEM_USE_HOST_PTR over alignedMem
+  unsigned int numIter;      // iteration count selected by open()
+  char* hostMem;             // malloc'd backing store for USE_HOST_PTR, freed in close()
+  char* alignedMem;          // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;          // open() sets this to 4096
+  unsigned int offset;       // byte offset added to alignedMem; set only for USE_HOST_PTR
+  bool isAMD;                // vendor is "Advanced Micro Devices, Inc." with devices present
+  char platformVersion[32];  // characters 7..9 of CL_PLATFORM_VERSION, NUL-terminated
+};
+// Same setup; run() times clEnqueueReadBufferRect over a square region instead.
+class OCLPerfBufferReadRectSpeed : public OCLPerfBufferReadSpeed {
+ public:
+  OCLPerfBufferReadRectSpeed() : OCLPerfBufferReadSpeed() {}
+
+ public:
+  virtual void run(void);
+};
+
+#endif  // _OCL_BufferReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp
new file mode 100644
index 0000000000..76cae8dfc3
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.cpp
@@ -0,0 +1,333 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfBufferWriteSpeed.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 8 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = { + 1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216}; + +static cl_uint blockedSubtests; + +static const unsigned int Iterations[2] = {1, + OCLPerfBufferWriteSpeed::NUM_ITER}; + +#define NUM_OFFSETS 1 +static const unsigned int offsets[NUM_OFFSETS] = {0}; +#define NUM_SUBTESTS (3 + NUM_OFFSETS) +extern const char *blkStr[2]; + +OCLPerfBufferWriteSpeed::OCLPerfBufferWriteSpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2; + blockedSubtests = _numSubTests; + _numSubTests += NUM_SIZES * NUM_SUBTESTS; +} + +OCLPerfBufferWriteSpeed::~OCLPerfBufferWriteSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfBufferWriteSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + persistent = false; + allocHostPtr = false; + useHostPtr = 
false; + hostMem = NULL; + alignedMem = NULL; + alignment = 4096; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + char getVersion[128]; + error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION, + sizeof(getVersion), getVersion, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + platformVersion[0] = getVersion[7]; + platformVersion[1] = getVersion[8]; + platformVersion[2] = getVersion[9]; + platformVersion[3] = '\0'; + bufSize_ = Sizes[_openTest % NUM_SIZES]; + + if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) { + useHostPtr = true; + offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3]; + } else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) { + persistent = true; + } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) { + allocHostPtr = true; + } + + if (_openTest < blockedSubtests) { + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)]; + } else { + numIter = + 4 * OCLPerfBufferWriteSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1); + } + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_READ_ONLY; + if (persistent) { + flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; + } else if (allocHostPtr) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset); + 
CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed"); + alignedMem = + (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) + + offset); + } + outBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + // Force memory to be on GPU if possible + { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, outBuffer_, memBuffer, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } +} + +void OCLPerfBufferWriteSpeed::run(void) { + CPerfCounter timer; + char *mem = new char[bufSize_]; + cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE; + + // Warm up + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0, + bufSize_, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBuffer failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, blocking, 0, + bufSize_, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBuffer failed"); + } + if (blocking != CL_TRUE) { + _wrapper->clFinish(cmd_queue_); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer write bandwidth in GB/s + double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char str[256]; + if (persistent) { + SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)"); + } else if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } else { + SNPRINTF(str, sizeof(str), "(GB/s)"); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_, + 
blkStr[blocking], numIter, str); + testDescString = buf; + + delete mem; +} + +unsigned int OCLPerfBufferWriteSpeed::close(void) { + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (hostMem) { + free(hostMem); + } + + return _crcword; +} + +void OCLPerfBufferWriteRectSpeed::run(void) { + CPerfCounter timer; + char *mem = new char[bufSize_]; + size_t width = static_cast(sqrt(static_cast(bufSize_))); + size_t bufOrigin[3] = {0, 0, 0}; + size_t hostOrigin[3] = {0, 0, 0}; + size_t region[3] = {width, width, 1}; + cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE; + + // Skip for 1.0 platforms + if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) { + char buf[256]; + SNPRINTF(buf, sizeof(buf), " SKIPPED "); + testDescString = buf; + return; + } + // Warm up + error_ = _wrapper->clEnqueueWriteBufferRect( + cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0, + width, 0, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBufferRect failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = _wrapper->clEnqueueWriteBufferRect( + cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width, + 0, width, 0, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBufferRect failed"); + } + if (blocking != CL_TRUE) { + _wrapper->clFinish(cmd_queue_); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer write bandwidth in GB/s + double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + _perfInfo = 
(float)perf; + char str[256]; + if (persistent) { + SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)"); + } else if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } else { + SNPRINTF(str, sizeof(str), "(GB/s)"); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_, + blkStr[blocking], numIter, str); + testDescString = buf; + + delete mem; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h new file mode 100644 index 0000000000..19e062d172 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfBufferWriteSpeed.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_BufferWriteSpeed_H_ +#define _OCL_BufferWriteSpeed_H_ + +#include "OCLTestImp.h" + +class OCLPerfBufferWriteSpeed : public OCLTestImp { + public: + OCLPerfBufferWriteSpeed(); + virtual ~OCLPerfBufferWriteSpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 1000; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int bufSize_; + bool persistent; + bool allocHostPtr; + bool useHostPtr; + unsigned int numIter; + char* hostMem; + char* alignedMem; + size_t alignment; + unsigned int offset; + bool isAMD; + char platformVersion[32]; +}; + +class OCLPerfBufferWriteRectSpeed : public OCLPerfBufferWriteSpeed { + public: + OCLPerfBufferWriteRectSpeed() : OCLPerfBufferWriteSpeed() {} + + public: + virtual void run(void); +}; + +#endif // _OCL_BufferWriteSpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp new file mode 100644 index 0000000000..3e108f5b48 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.cpp @@ -0,0 +1,304 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfCPUMemSpeed.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; + +#define ITER_COUNT 2 +static const unsigned int Iterations[2] = {1, OCLPerfCPUMemSpeed::NUM_ITER}; +#define NUM_OFFSETS 1 +static const unsigned int offsets[NUM_OFFSETS] = {0}; +#define NUM_SUBTESTS (3 + NUM_OFFSETS) +OCLPerfCPUMemSpeed::OCLPerfCPUMemSpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * ITER_COUNT * 3; +} + +OCLPerfCPUMemSpeed::~OCLPerfCPUMemSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfCPUMemSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + persistent = false; + allocHostPtr = false; + useHostPtr = false; + hostMem = NULL; + alignedMem = NULL; + alignment = 4096; + testMemset = false; + isAMD = false; + gpuSrc = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + 
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + CHECK_RESULT(num_devices == 0, "No devices found, cannot proceed"); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + // if (num_devices > 0) + //{ + // platform = platforms[_platformIndex]; + // break; + //} +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) { + useHostPtr = true; + offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3]; + } else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) { + persistent = true; + } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) { + allocHostPtr = true; + } + + numIter = Iterations[(_openTest / (NUM_SIZES * NUM_SUBTESTS)) % 2]; + if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT * 2)) + testMemset = true; + else if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT)) { + gpuSrc = true; + numIter = std::min(numIter, 10u); + } + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags; + if (gpuSrc) { + flags = CL_MEM_WRITE_ONLY; + mapFlags = CL_MAP_READ; + } else { + flags = CL_MEM_READ_ONLY; + mapFlags = CL_MAP_WRITE; + } + if (persistent) { + flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; + } else if (allocHostPtr) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset); + CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed"); + alignedMem = + (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) + + offset); + 
} + outBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + // Force memory to be on GPU if possible + { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } +} + +void OCLPerfCPUMemSpeed::run(void) { + CPerfCounter timer; + + void *mem; + // Warm up + mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags, + 0, bufSize_, 0, NULL, NULL, &error_); + + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags, + 0, bufSize_, 0, NULL, NULL, &error_); + + char *cpumem = new char[bufSize_]; + + timer.Reset(); + timer.Start(); + if (testMemset) { + for (unsigned int i = 0; i < numIter; i++) { + memset(mem, 0, bufSize_); + } + } else { + if (gpuSrc) { + for (unsigned int i = 0; i < numIter; i++) { + memcpy((void *)cpumem, mem, bufSize_); + } + } else { + for (unsigned int i = 0; i < numIter; i++) { + memcpy(mem, (void *)cpumem, bufSize_); + } + } + } + + timer.Stop(); + + delete[] cpumem; + + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + double sec = timer.GetElapsedTime(); + + // Map read bandwidth in GB/s + double perf = 
((double)bufSize_ * numIter * (double)(1e-09)) / sec; + _perfInfo = (float)perf; + + char str[256]; + if (persistent) { + SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)"); + } else if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } else { + SNPRINTF(str, sizeof(str), "(GB/s)"); + } + const char *str2 = NULL; + if (testMemset) + str2 = "memset to dev"; + else { + if (gpuSrc) + str2 = "memcpy from dev"; + else + str2 = "memcpy to dev"; + } + + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) %15s i: %4d %29s ", bufSize_, str2, + numIter, str); + testDescString = buf; +} + +unsigned int OCLPerfCPUMemSpeed::close(void) { + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (hostMem) { + free(hostMem); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h new file mode 100644 index 0000000000..3313d53795 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCPUMemSpeed.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_CPUMemSpeed_H_ +#define _OCL_CPUMemSpeed_H_ + +#include "OCLTestImp.h" + +class OCLPerfCPUMemSpeed : public OCLTestImp { + public: + OCLPerfCPUMemSpeed(); + virtual ~OCLPerfCPUMemSpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 100; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int bufSize_; + bool persistent; + bool allocHostPtr; + bool useHostPtr; + unsigned int numIter; + bool testMemset; + char* hostMem; + char* alignedMem; + size_t alignment; + unsigned int offset; + bool isAMD; + bool gpuSrc; + cl_map_flags mapFlags; +}; + +#endif // _OCL_CPUMemSpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp new file mode 100644 index 0000000000..81b2b676ae --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.cpp @@ -0,0 +1,146 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfCommandQueue.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +static const size_t BufSize = 0x1000; +static const size_t Iterations = 0x100; +static const size_t TotalQueues = 4; +static const size_t TotalBufs = 4; + +OCLPerfCommandQueue::OCLPerfCommandQueue() { + _numSubTests = TotalQueues * TotalBufs; + failed_ = false; +} + +OCLPerfCommandQueue::~OCLPerfCommandQueue() {} + +void OCLPerfCommandQueue::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + cl_mem buffer; + _deviceId = deviceId; + CPerfCounter timer; + timer.Reset(); + timer.Start(); + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + timer.Stop(); + if (test == 0) { + printf("Runtime load/init time: %0.2f ms\n", + static_cast(timer.GetElapsedTime() * 1000)); + } + test_ = test; + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + static const size_t MemObjects[] = {1, 100, 1000, 5000}; + size_t numMems = MemObjects[test_ / TotalBufs]; + size_t bufSize = BufSize * sizeof(cl_int4); + for (size_t b = 0; b < numMems; ++b) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + 
const void* private_info, size_t cb, + void* user_data) {} + +void OCLPerfCommandQueue::run(void) { + if (failed_) { + return; + } + unsigned int* values; + values = reinterpret_cast(new cl_int4[BufSize]); + CPerfCounter timer; + static const size_t Queues[] = {1, 2, 4, 8}; + size_t numQueues = Queues[test_ % TotalQueues]; + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + + size_t iter = + Iterations / (numQueues * ((size_t)1 << (test_ / TotalBufs + 1))); + std::vector cmdQueues(numQueues); + + timer.Reset(); + timer.Start(); + + for (size_t i = 0; i < iter; ++i) { + for (size_t q = 0; q < numQueues; ++q) { + cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue( + context_, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed"); + cmdQueues[q] = cmdQueue; + } + timer.Stop(); + for (size_t q = 0; q < numQueues; ++q) { + for (size_t b = 0; b < buffers_.size(); ++b) { + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[b], + CL_TRUE, 0, sizeof(cl_int4), + values, 0, NULL, NULL); + } + } + timer.Start(); + for (size_t q = 0; q < numQueues; ++q) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + } + + timer.Stop(); + + std::stringstream stream; + + stream << "Create+destroy time for " << numQueues << " queues and " + << buffers_.size() << " buffers"; + stream.precision(3); + stream.width(5); + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "(ms)"; + testDescString = stream.str(); + _perfInfo = + static_cast(timer.GetElapsedTime() * 1000 / (iter * numQueues)); + delete[] values; +} + +unsigned int OCLPerfCommandQueue::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h new file mode 100644 index 0000000000..cd6f710a18 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfCommandQueue.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PERF_COMMAND_QUEUE_H_ +#define _OCL_PERF_COMMAND_QUEUE_H_ + +#include "OCLTestImp.h" + +class OCLPerfCommandQueue : public OCLTestImp { + public: + OCLPerfCommandQueue(); + virtual ~OCLPerfCommandQueue(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int test_; +}; + +#endif // _OCL_PERF_COMMAND_QUEUE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp new file mode 100644 index 0000000000..0c4ba342ef --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.cpp @@ -0,0 +1,563 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "OCLPerfConcurrency.h"
+
+// NOTE(review): the three system header names were stripped when this patch
+// text was extracted; restored from what this file uses (assert, printf,
+// malloc/free) - confirm against the upstream file.
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Centre (x, y) and width of the region of the complex plane to render.
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},  // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+// Vectorised Mandelbrot kernel: each work-item handles 4 pixels and writes
+// the four per-pixel iteration counts as one uint4.
+static const char *float_mandel_vec =
+    "__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
+    "float yPos, float xStep, float yStep, uint maxIter)\n"
+    "{\n"
+    " int tid = get_global_id(0);\n"
+    " int i = tid % (width/4);\n"
+    " int j = tid / (width/4);\n"
+    " int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
+    " int4 vecj = (int4)(j, j, j, j);\n"
+    " float4 x0;\n"
+    " x0.s0 = (float)(xPos + xStep*veci.s0);\n"
+    " x0.s1 = (float)(xPos + xStep*veci.s1);\n"
+    " x0.s2 = (float)(xPos + xStep*veci.s2);\n"
+    " x0.s3 = (float)(xPos + xStep*veci.s3);\n"
+    " float4 y0;\n"
+    " y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
+    " y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
+    " y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
+    " y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
+    "\n"
+    " float4 x = x0;\n"
+    " float4 y = y0;\n"
+    "\n"
+    " uint iter = 0;\n"
+    " float4 tmp;\n"
+    " int4 stay;\n"
+    " int4 ccount = 0;\n"
+    " float4 savx = x;\n"
+    " float4 savy = y;\n"
+    " stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    " for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
+    "maxIter); iter+=16)\n"
+    " {\n"
+    " x = savx;\n"
+    " y = savy;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " // Two iterations\n"
+    " tmp = x*x + x0 - y*y;\n"
+    " y = 2.0f * x * y + y0;\n"
+    " x = tmp*tmp + x0 - y*y;\n"
+    " y = 2.0f * tmp * y + y0;\n"
+    "\n"
+    " stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
+    " savx = (stay ? x : savx);\n"
+    " savy = (stay ? y : savy);\n"
+    " ccount -= stay*16;\n"
+    " }\n"
+    " // Handle remainder\n"
+    " if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
+    " {\n"
+    " iter = 16;\n"
+    " do\n"
+    " {\n"
+    " x = savx;\n"
+    " y = savy;\n"
+    " // More efficient to use scalar ops here: Why?\n"
+    " stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
+    "maxIter);\n"
+    " stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
+    "maxIter);\n"
+    " stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
+    "maxIter);\n"
+    " stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
+    "maxIter);\n"
+    " tmp = x;\n"
+    " x = x*x + x0 - y*y;\n"
+    " y = 2.0f*tmp*y + y0;\n"
+    " ccount += stay;\n"
+    " iter--;\n"
+    " savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
+    " savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
+    " savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
+    " savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
+    " savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
+    " savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
+    " savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
+    " savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
+    " } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
+    " }\n"
+    " __global uint4 *vecOut = (__global uint4 *)out;\n"
+    " vecOut[tid] = convert_uint4(ccount);\n"
+    "}\n";
+
+// Ten sub-tests per coordinate set; open() maps the sub-test index to a
+// queue/program/kernel/buffer configuration.
+OCLPerfConcurrency::OCLPerfConcurrency() { _numSubTests = 10 * numCoords; }
+
+OCLPerfConcurrency::~OCLPerfConcurrency() {}
+
+// Fills the first width_ uints of `buffer` with `val` through a blocking map
+// on queue 0, then unmaps and waits for the queue to drain.
+void OCLPerfConcurrency::setData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_[0], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // A failed map yields NULL; writing through it would crash the whole run.
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < width_; i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[0], buffer, data, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_[0]);
+}
+
+// Sums the first width_ uints of `buffer` into totalIters (reset to 0 first)
+// through a blocking read-map on queue 0.
+void OCLPerfConcurrency::checkData(cl_mem buffer) {
+  totalIters = 0;
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_[0], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  // A failed map yields NULL; reading through it would crash the whole run.
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < width_; i++) {
+    totalIters += data[i];
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[0], buffer, data, 0,
+                                             NULL, NULL);
+  _wrapper->clFinish(cmd_queue_[0]);
+}
+
+// Context error callback; intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Creates the context, queues, programs, kernels and output buffers for
+// sub-test `test` on GPU device `deviceId`, binds the kernel arguments for a
+// short warm-up (maxIter = 256) and finally computes the maxIter used by the
+// timed pass in run(). Returns early, leaving only a description string, when
+// the test type is not GPU.
+void OCLPerfConcurrency::open(unsigned int test, char *units,
+                              double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  unsigned int i;
+
+  if (type_ != CL_DEVICE_TYPE_GPU) {
+    char msg[256];
+    SNPRINTF(msg, sizeof(msg), "No GPU devices present. Exiting!\t");
+    testDescString = msg;
+    return;
+  }
+
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  // Zero the counts so close() releases nothing if open() bails out before
+  // the sub-test configuration is chosen.
+  num_cmd_queues = num_programs = num_kernels = num_outbuffers = 0;
+
+  for (i = 0; i < MAX_ASYNC_QUEUES; i++) {
+    cmd_queue_[i] = 0;
+    program_[i] = 0;
+    kernel_[i] = 0;
+    outBuffer_[i] = 0;
+  }
+
+  // Maximum iteration count
+  // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is
+  // divisible by 16.
+  // NOTE: Can increase to get better peak performance numbers, but be sure
+  // not to TDR slow ASICs!
+  // NOTE: For the warmup run we use maxIter = 256; the value for the actual
+  // run is derived from the clock frequency and CU count at the end of open().
+  maxIter = 256;
+
+  // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
+  // processes 4 pixels at once.
+  // NOTE: Can increase to get better peak performance numbers, but be sure
+  // not to TDR slow ASICs!
+  width_ = 256;
+
+  // One uint of output per pixel.
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    const cl_int idsErr =
+        _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    error_ = idsErr;
+    // Out-of-range _platformIndex leaves `platform` NULL, which the check
+    // below reports instead of reading past the array.
+    if (idsErr == CL_SUCCESS && (cl_uint)_platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      // The runtime returns an error when no GPU devices are present instead
+      // of just returning 0 devices, so the status is deliberately not
+      // checked here.
+      error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    }
+    // Array form of delete to match new[]; done before any early return so
+    // the list cannot leak.
+    delete[] platforms;
+    CHECK_RESULT(idsErr != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  CHECK_RESULT(num_devices == 0, "no devices");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  const bool haveDevice = (error_ == CL_SUCCESS) && (_deviceId < num_devices);
+  if (haveDevice) device = devices[_deviceId];
+  // Only the selected id is needed from here on; free the list before the
+  // checks below can return.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(!haveDevice, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cl_uint numAsyncQueues;
+  error_ = _wrapper->clGetDeviceInfo(
+      device, CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD, sizeof(numAsyncQueues),
+      &numAsyncQueues, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  CHECK_RESULT(numAsyncQueues > MAX_ASYNC_QUEUES,
+               "numAsyncQueues is too large for this test");
+
+  // CL_DEVICE_MAX_COMPUTE_UNITS is a cl_uint. Query it with the matching size
+  // and widen afterwards; reading it straight into the size_t member left the
+  // upper bytes undefined on 64-bit builds.
+  cl_uint maxComputeUnits = 0;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(maxComputeUnits), &maxComputeUnits,
+                                     &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  numCUs = maxComputeUnits;
+
+  switch (_openTest) {
+    case 0:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers = 1;
+      break;
+
+    case 1:
+      num_cmd_queues = 1;
+      num_programs = 1;
+      num_kernels = 1;
+      num_outbuffers = 2;
+      break;
+
+    case 2:
+      num_cmd_queues = 1;
+      num_programs = 2;
+      num_kernels = 2;
+      num_outbuffers = 2;
+      break;
+
+    case 3:
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers = 2;
+      break;
+
+    case 4:
+    case 5:
+    case 6:
+    case 7:
+    case 8:
+    case 9:
+      // NOTE(review): "% 8" yields 0 when the device reports exactly 8 async
+      // queues, which turns these sub-tests into no-ops - confirm intent.
+      num_cmd_queues = num_programs = num_kernels = num_outbuffers =
+          numAsyncQueues % 8;
+      break;
+
+    default:
+      break;
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    cmd_queue_[i] = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+    CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+  }
+
+  const char *tmp;
+  tmp = float_mandel_vec;
+
+  for (i = 0; i < num_programs; i++) {
+    program_[i] = _wrapper->clCreateProgramWithSource(
+        context_, 1, (const char **)&tmp, NULL, &error_);
+    CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
+
+    error_ = _wrapper->clBuildProgram(program_[i], 1, &device, "", NULL, NULL);
+
+    if (error_ != CL_SUCCESS) {
+      char log[16384];
+      // Start from an empty string so a failed log query prints nothing
+      // rather than uninitialised stack contents.
+      log[0] = '\0';
+      _wrapper->clGetProgramBuildInfo(program_[i], device,
+                                      CL_PROGRAM_BUILD_LOG,
+                                      16384 * sizeof(char), log, NULL);
+      printf("Build error -> %s\n", log);
+    }
+    // error_ still holds the build status; the original passed a constant 0
+    // here, so a build failure was never reported.
+    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
+  }
+
+  for (i = 0; i < num_kernels; i++) {
+    kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
+    CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
+  }
+
+  coordIdx = _openTest % numCoords;
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[i]);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint),
+                                      (void *)&width_);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float),
+                                      (void *)&xPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float),
+                                      (void *)&yPos);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float),
+                                      (void *)&xStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float),
+                                      (void *)&yStep);
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    setData(outBuffer_[i], 0xdeadbeef);
+  }
+
+  unsigned int clkFrequency = 0;
+  // Routed through _wrapper like every other CL call in this file.
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                     sizeof(clkFrequency), &clkFrequency, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  assert(clkFrequency > 0);
+  // Iteration count for the timed run, scaled by clock and CU count and
+  // rounded up to a multiple of 16 (the kernel's unroll factor).
+  maxIter =
+      (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
+  maxIter = (maxIter + 15) & ~15;
+}
+
+// Runs a warm-up pass with the arguments bound in open(), rebinds maxIter to
+// the full value, then times one pass of every kernel (plus a second launch
+// of kernel 0 into outBuffer_[1] for sub-test 1) and validates the summed
+// iteration counts. The wall time in seconds is stored in _perfInfo.
+void OCLPerfConcurrency::run(void) {
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return;
+
+  int global = width_ >> 2;
+  // We handle 4 pixels per thread
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+  unsigned int i;
+
+  // Warmup
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  // Switch from the warm-up iteration count to the one computed in open().
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
+                                      (void *)&maxIter);
+  }
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+
+  for (i = 0; i < num_kernels; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
+        (const size_t *)global_work_size, (const size_t *)local_work_size, 0,
+        NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  if (_openTest == 1) {
+    // Same kernel object, second output buffer.
+    error_ = _wrapper->clSetKernelArg(kernel_[0], 0, sizeof(cl_mem),
+                                      (void *)&outBuffer_[1]);
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_[0], kernel_[0], 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFlush(cmd_queue_[i]);
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    _wrapper->clFinish(cmd_queue_[i]);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Every pixel of the chosen region is expected to reach maxIter.
+  unsigned long long expected =
+      (unsigned long long)width_ * (unsigned long long)maxIter;
+
+  for (i = 0; i < num_outbuffers; i++) {
+    checkData(outBuffer_[i]);
+    CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
+  }
+
+  _perfInfo = (float)sec;
+  if (_openTest == 0)
+    testDescString = "time for 1 kernel (s) ";
+  else if (_openTest == 1)
+    testDescString = "time for 2 kernels (s) (same kernel) ";
+  else if (_openTest == 2)
+    testDescString = "time for 2 kernels (s) (diff kernels)";
+  else {
+    char buf[128];
+    // %u: both counters are unsigned.
+    SNPRINTF(buf, sizeof(buf), "time for %u kernels (s) ( %u queues) ",
+             num_kernels, num_cmd_queues);
+    testDescString = buf;
+  }
+}
+
+// Releases every object open() created and returns _crcword. Handles that
+// were never created (open() returned early) are skipped.
+unsigned int OCLPerfConcurrency::close(void) {
+  unsigned int i;
+
+  // Test runs only on GPU
+  if (type_ != CL_DEVICE_TYPE_GPU) return 0;
+
+  if (cmd_queue_[0]) {
+    _wrapper->clFinish(cmd_queue_[0]);
+  }
+
+  for (i = 0; i < num_outbuffers; i++) {
+    if (outBuffer_[i] == 0) continue;
+    error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+
+  for (i = 0; i < num_kernels; i++) {
+    if (kernel_[i] == 0) continue;
+    error_ = _wrapper->clReleaseKernel(kernel_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseKernel(kernel_) failed");
+  }
+
+  for (i = 0; i < num_programs; i++) {
+    if (program_[i] == 0) continue;
+    error_ = _wrapper->clReleaseProgram(program_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseProgram(program_) failed");
+  }
+
+  for (i = 0; i < num_cmd_queues; i++) {
+    if (cmd_queue_[i] == 0) continue;
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h
new file mode 100644
index 0000000000..850e146b04
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfConcurrency.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
*/
+
+#ifndef _OCL_Perf_Concurrency_H_
+#define _OCL_Perf_Concurrency_H_
+
+#include "OCLTestImp.h"
+
+// Times a Mandelbrot kernel launched across one or more command queues,
+// programs, kernels and output buffers; the mix is chosen per sub-test in
+// open().
+class OCLPerfConcurrency : public OCLTestImp {
+ public:
+  OCLPerfConcurrency();
+  virtual ~OCLPerfConcurrency();
+
+ public:
+  // Creates all OpenCL objects for sub-test `test` on device `deviceID`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Warm-up pass followed by the timed pass and result validation.
+  virtual void run(void);
+  // Releases the objects created by open().
+  virtual unsigned int close(void);
+
+  std::string shader_;
+  // Fills the first width_ uints of `buffer` with `data`.
+  void setData(cl_mem buffer, unsigned int data);
+  // Sums the first width_ uints of `buffer` into totalIters.
+  void checkData(cl_mem buffer);
+
+// Capacity of the per-queue arrays below.
+#define MAX_ASYNC_QUEUES 8
+
+  cl_context context_;
+  cl_command_queue cmd_queue_[MAX_ASYNC_QUEUES];
+  cl_program program_[MAX_ASYNC_QUEUES];
+  cl_kernel kernel_[MAX_ASYNC_QUEUES];
+  cl_mem outBuffer_[MAX_ASYNC_QUEUES];
+  cl_int error_;  // status of the most recent OpenCL call
+
+  // Number of valid entries in the arrays above for the current sub-test.
+  unsigned int num_cmd_queues;
+  unsigned int num_programs;
+  unsigned int num_kernels;
+  unsigned int num_outbuffers;
+
+  unsigned int width_;            // number of output pixels
+  unsigned int bufSize_;          // output buffer size in bytes
+  unsigned int maxIter;           // iteration cap passed to the kernel
+  unsigned int coordIdx;          // index into the coordinate table
+  unsigned long long totalIters;  // sum produced by checkData()
+  size_t numCUs;                  // device compute-unit count
+};
+
+#endif  // _OCL_Perf_Concurrency_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp
new file mode 100644
index 0000000000..deb61efa8b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.cpp
@@ -0,0 +1,243 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "OCLPerfDevMemReadSpeed.h"
+
+// NOTE(review): the three system header names were stripped when this patch
+// text was extracted; restored from what this file uses (printf/fflush,
+// memset) - confirm against the upstream file.
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vector>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024};
+
+// Each work-item strides through src in uint16 steps, sums every component it
+// reads and atomically adds its partial sum to *dst.
+const static char *strKernel =
+    "__kernel void read_kernel(__global uint16 *src, ulong size1, uint "
+    "threads, __global uint* dst\n"
+    " )\n"
+    "{\n"
+    " uint16 pval;\n"
+    " int idx = get_global_id(0);\n"
+    " __global uint16 *srcEnd = src + size1;\n"
+    " uint tmp = 0;\n"
+    " src = &src[idx];"
+    " while (src < srcEnd) \n"
+    " {\n"
+    " pval = *src;\n"
+    " src += threads;\n"
+    " tmp += pval.s0 + pval.s1 + pval.s2 + pval.s3 + pval.s4 + pval.s5 + pval.s6 + \
+ pval.s7 + pval.s8 + pval.s9 + pval.sa + pval.sb + pval.sc + pval.sd + pval.se + pval.sf;\n"
+    " }\n"
+    " atomic_add(dst, tmp);\n"
+    "}\n";
+
+OCLPerfDevMemReadSpeed::OCLPerfDevMemReadSpeed() { _numSubTests = 1; }
+
+OCLPerfDevMemReadSpeed::~OCLPerfDevMemReadSpeed() {}
+
+// Context error callback; intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Builds the read kernel, creates a source buffer of nBytes filled with
+// inputData and a one-uint destination buffer cleared to 0, and binds the
+// kernel arguments.
+void OCLPerfDevMemReadSpeed::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  // Reset this class's own state first so close() sees NULL handles if any
+  // later step returns early.
+  skip_ = false;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+
+  error_ = CL_SUCCESS;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  program_ = 0;
+  kernel_ = 0;
+  nBytes = Sizes[0];
+  // Number of uint16 elements in the source buffer.
+  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));
+  cl_uint maxCUs;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(cl_uint), &maxCUs, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  wgs = 64;
+  const static cl_uint wavesPerCU = 8;
+  nWorkItems = maxCUs * wavesPerCU * wgs;
+
+  inputData = 0x1;
+  nIter = 1000;
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    // Start from an empty string in case the log query itself fails.
+    programLog[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "read_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, nBytes,
+                                        NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(srcBuffer) failed");
+  void *mem;
+  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], srcBuffer_, CL_TRUE,
+                                     CL_MAP_READ | CL_MAP_WRITE, 0, nBytes, 0,
+                                     NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); ++i) {
+    reinterpret_cast<cl_uint *>(mem)[i] = inputData;
+  }
+  // Unmap right after filling so the buffer is not left mapped if a later
+  // step returns early.
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], srcBuffer_, mem, 0,
+                                    NULL, NULL);
+
+  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
+                                        sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
+  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_TRUE,
+                                     CL_MAP_READ | CL_MAP_WRITE, 0,
+                                     sizeof(cl_uint), 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  memset(mem, 0, sizeof(cl_uint));
+  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], dstBuffer_, mem, 0,
+                                    NULL, NULL);
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&nWorkItems);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&dstBuffer_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+}
+
+// Runs one validated warm-up launch, then nIter timed launches. Wall-clock
+// throughput goes into the description string; throughput derived from the
+// per-event profiling timestamps goes into _perfInfo.
+void OCLPerfDevMemReadSpeed::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  CPerfCounter timer;
+
+  size_t gws[1] = {nWorkItems};
+  size_t lws[1] = {wgs};
+
+  // warm up
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // A stack variable is enough for the single result word and cannot leak on
+  // the early returns below.
+  cl_uint warmupSum = 0;
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_,
+                                         CL_FALSE, 0, sizeof(cl_uint),
+                                         &warmupSum, 0, NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Every source uint holds inputData (1), so one pass must sum to the
+  // element count. The original passed a constant 0 as the condition, so a
+  // mismatch was never recorded as a failure.
+  CHECK_RESULT(warmupSum != (nBytes / sizeof(cl_uint)),
+               "Data validation failed for warm up run!\n");
+
+  timer.Reset();
+  timer.Start();
+  double sec2 = 0;
+  std::vector<cl_event> events(nIter);
+  unsigned int enqueued = 0;
+  cl_int enqueueErr = CL_SUCCESS;
+  for (; enqueued < nIter; enqueued++) {
+    enqueueErr = _wrapper->clEnqueueNDRangeKernel(
+        cmdQueues_[_deviceId], kernel_, 1, NULL, gws, lws, 0, NULL,
+        &events[enqueued]);
+    if (enqueueErr != CL_SUCCESS) break;
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer.Stop();
+
+  // Collect timestamps and release every event that was created before any
+  // failure is reported, so no event leaks on the error paths.
+  cl_int profErr = CL_SUCCESS;
+  for (unsigned int i = 0; i < enqueued; i++) {
+    cl_ulong startTime = 0, endTime = 0;
+    if (profErr == CL_SUCCESS) {
+      profErr = _wrapper->clGetEventProfilingInfo(
+          events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
+          0);
+    }
+    if (profErr == CL_SUCCESS) {
+      profErr = _wrapper->clGetEventProfilingInfo(
+          events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
+    }
+    if (profErr == CL_SUCCESS) {
+      sec2 += (double)(endTime - startTime);
+    }
+    _wrapper->clReleaseEvent(events[i]);
+  }
+  error_ = (enqueueErr != CL_SUCCESS) ? enqueueErr : profErr;
+  CHECK_RESULT(enqueueErr != CL_SUCCESS, "clEnqueueNDRangeKernel() failed");
+  CHECK_RESULT(profErr != CL_SUCCESS, "clGetEventProfilingInfo failed");
+
+  double sec = timer.GetElapsedTime();
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
+  // sec2 is in nanoseconds, so bytes / sec2 is already GB/s.
+  double perf2 = ((double)nBytes * nIter) / sec2;
+  _perfInfo = (float)perf2;
+  float perfInfo = (float)perf;
+  char buf[256];
+  // %u: nBytes and nIter are unsigned.
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i:%4u Wall time Perf: %.2f (GB/s)",
+           nBytes, nIter, perfInfo);
+  testDescString = buf;
+}
+
+// Releases the two buffers owned by this class, then defers to the base
+// class for the rest of the teardown.
+unsigned int OCLPerfDevMemReadSpeed::close(void) {
+  if (!skip_) {
+    if (srcBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(srcBuffer_) failed");
+      srcBuffer_ = 0;
+    }
+
+    if (dstBuffer_) {
+      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(dstBuffer_) failed");
+      dstBuffer_ = 0;
+    }
+  }
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h
new file mode 100644
index 0000000000..631b185229
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemReadSpeed.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_DevMemReadSpeed_H_
+#define _OCL_DevMemReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Measures device-memory read throughput: a kernel reads a source buffer and
+// accumulates a checksum into a one-uint destination buffer.
+class OCLPerfDevMemReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfDevMemReadSpeed();
+  virtual ~OCLPerfDevMemReadSpeed();
+
+ public:
+  // Builds the kernel, creates and fills the buffers, binds kernel arguments.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Validated warm-up launch followed by nIter timed launches.
+  virtual void run(void);
+  // Releases the buffers, then calls the base-class close().
+  virtual unsigned int close(void);
+
+  cl_mem srcBuffer_;        // buffer the kernel reads from
+  cl_mem dstBuffer_;        // single uint receiving the accumulated sum
+  unsigned int nWorkItems;  // number of GPU work items
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // input and output buffer size
+  unsigned int nIter;       // overall number of timing loops
+  cl_uint inputData;        // input data to fill the input buffer
+  bool skip_;               // when true, run() and buffer release are skipped
+};
+
+#endif  // _OCL_DevMemReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp
new file mode 100644
index 0000000000..83992db7a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfDevMemWriteSpeed.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 1 +static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024}; + +const static char *strKernel = + + "__kernel void write_kernel(__global uint16 *dst, ulong size1, uint " + "threads\n" + " )\n" + "{\n" + " uint16 pval = (uint16)(0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab,\ + 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab);\n" + " int idx = get_global_id(0);\n" + " __global uint16 *dstEnd = dst + size1;\n" + " dst = &dst[idx];" + " do\n" + " {\n" + " *dst = pval;\n" + " dst += threads;\n" + " }\n" + " while (dst < dstEnd);\n" + "}\n"; + +OCLPerfDevMemWriteSpeed::OCLPerfDevMemWriteSpeed() { _numSubTests = 1; } + +OCLPerfDevMemWriteSpeed::~OCLPerfDevMemWriteSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfDevMemWriteSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + skip_ = false; + dstBuffer_ = 0; + nBytes = Sizes[0]; + cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint)); + cl_uint maxCUs; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(cl_uint), &maxCUs, 0); + CHECK_RESULT(error_ != 
CL_SUCCESS, "clGetDeviceInfo failed"); + wgs = 64; + const static cl_uint wavesPerCU = 8; + nWorkItems = maxCUs * wavesPerCU * wgs; + inputData = 0xabababab; + nIter = 1000; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "write_kernel", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, nBytes, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &dstBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), + (void *)&nWorkItems); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); +} + +void OCLPerfDevMemWriteSpeed::run(void) { + if (skip_) { + return; + } + + CPerfCounter timer; + + size_t gws[1] = {nWorkItems}; + size_t lws[1] = {wgs}; + + // warm up + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + cl_uint *memResult; + memResult = (cl_uint *)malloc(nBytes); + if (0 == memResult) { + 
CHECK_RESULT_NO_RETURN(0, "malloc failed!\n"); + return; + } + + memset(memResult, 0, nBytes); + error_ = + _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_FALSE, + 0, nBytes, memResult, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); i++) { + if (((cl_uint *)memResult)[i] != inputData) { + CHECK_RESULT_NO_RETURN(0, "Data validation failed for warm up run!\n"); + free(memResult); + return; + } + } + + free(memResult); + + timer.Reset(); + timer.Start(); + double sec2 = 0; + cl_event *events = new cl_event[nIter]; + for (unsigned int i = 0; i < nIter; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmdQueues_[_deviceId], kernel_, 1, NULL, gws, lws, 0, NULL, &events[i]); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + for (unsigned int i = 0; i < nIter; i++) { + cl_ulong startTime = 0, endTime = 0; + error_ = _wrapper->clGetEventProfilingInfo( + events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); + CHECK_RESULT(error_, "clGetEventProfilingInfo failed"); + error_ = _wrapper->clGetEventProfilingInfo( + events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); + CHECK_RESULT(error_, "clGetEventProfilingInfo failed"); + + _wrapper->clReleaseEvent(events[i]); + sec2 += endTime - startTime; + } + double sec = timer.GetElapsedTime(); + delete[] events; + + // write speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec; + double perf2 = ((double)nBytes * nIter) / sec2; + _perfInfo = (float)perf2; + float perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) i:%4d Wall time Perf: %.2f (GB/s)", + nBytes, nIter, perfInfo); + testDescString = buf; +} + +unsigned int OCLPerfDevMemWriteSpeed::close(void) { + if (!skip_) { + if (dstBuffer_) 
{ + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + } + + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h new file mode 100644 index 0000000000..7bdfdc70b1 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDevMemWriteSpeed.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
#ifndef _OCL_DevMemWriteSpeed_H_
#define _OCL_DevMemWriteSpeed_H_

#include "OCLTestImp.h"

// Performance test that repeatedly launches a "write_kernel" kernel into a
// write-only device buffer and reports the achieved write rate.
class OCLPerfDevMemWriteSpeed : public OCLTestImp {
 public:
  OCLPerfDevMemWriteSpeed();
  virtual ~OCLPerfDevMemWriteSpeed();

 public:
  // Builds the program, creates the kernel and destination buffer and binds
  // the kernel arguments for sub-test `test` on device `deviceID`.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Warm-up + validation, then the timed launches.
  virtual void run(void);
  // Releases dstBuffer_ and forwards to OCLTestImp::close().
  virtual unsigned int close(void);

  cl_mem dstBuffer_;        // device buffer the kernel writes into
  unsigned int nWorkItems;  // number of GPU work items
  unsigned int wgs;         // work group size
  unsigned int nBytes;      // output buffer size
  unsigned int nIter;       // overall number of timing loops
  cl_uint inputData;        // input data to fill the input buffer; run()
                            // compares every read-back word against it
  bool skip_;               // when set, run() returns immediately and close()
                            // does not release dstBuffer_
};

#endif  // _OCL_DevMemWriteSpeed_H_
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfDeviceConcurrency.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +typedef struct { + double x; + double y; + double width; +} coordRec; + +static coordRec coords[] = { + {0.0, 0.0, 0.00001}, // All black +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +static const char *float_mandel_vec = + "__kernel void mandelbrot(__global uint *out, uint width, float xPos, " + "float yPos, float xStep, float yStep, uint maxIter)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i = tid % (width/4);\n" + " int j = tid / (width/4);\n" + " int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n" + " int4 vecj = (int4)(j, j, j, j);\n" + " float4 x0;\n" + " x0.s0 = (float)(xPos + xStep*veci.s0);\n" + " x0.s1 = (float)(xPos + xStep*veci.s1);\n" + " x0.s2 = (float)(xPos + xStep*veci.s2);\n" + " x0.s3 = (float)(xPos + xStep*veci.s3);\n" + " float4 y0;\n" + " y0.s0 = (float)(yPos + yStep*vecj.s0);\n" + " y0.s1 = (float)(yPos + yStep*vecj.s1);\n" + " y0.s2 = (float)(yPos + yStep*vecj.s2);\n" + " y0.s3 = (float)(yPos + yStep*vecj.s3);\n" + "\n" + " float4 x = x0;\n" + " float4 y = y0;\n" + "\n" + " uint iter = 0;\n" + " float4 tmp;\n" + " int4 stay;\n" + " int4 ccount = 0;\n" + " float4 savx = x;\n" + " float4 savy = y;\n" + " stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n" + " for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < " + "maxIter); iter+=16)\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - 
y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " // Two iterations\n" + " tmp = x*x + x0 - y*y;\n" + " y = 2.0f * x * y + y0;\n" + " x = tmp*tmp + x0 - y*y;\n" + " y = 2.0f * tmp * y + y0;\n" + "\n" + " stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n" + " savx = (stay ? x : savx);\n" + " savy = (stay ? y : savy);\n" + " ccount -= stay*16;\n" + " }\n" + " // Handle remainder\n" + " if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n" + " {\n" + " iter = 16;\n" + " do\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + " // More efficient to use scalar ops here: Why?\n" + " stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < " + "maxIter);\n" + " stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < " + "maxIter);\n" + " stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < " + "maxIter);\n" + " stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < " + "maxIter);\n" + " tmp = x;\n" + " x = x*x + x0 - y*y;\n" + " y = 2.0f*tmp*y + y0;\n" + " ccount += stay;\n" + " iter--;\n" + " savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n" + " savx.s1 = (stay.s1 ? 
x.s1 : savx.s1);\n" + " savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n" + " savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n" + " savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n" + " savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n" + " savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n" + " savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n" + " } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n" + " }\n" + " __global uint4 *vecOut = (__global uint4 *)out;\n" + " vecOut[tid] = convert_uint4(ccount);\n" + "}\n"; + +OCLPerfDeviceConcurrency::OCLPerfDeviceConcurrency() { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + platform = platforms[_platformIndex]; + num_devices = 0; + /* Get the number of requested devices */ + + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + if (num_devices > MAX_DEVICES) { + num_devices = MAX_DEVICES; + } + delete platforms; + } + _numSubTests = num_devices; +} + +OCLPerfDeviceConcurrency::~OCLPerfDeviceConcurrency() {} + +void OCLPerfDeviceConcurrency::setData(cl_mem buffer, unsigned int idx, + unsigned int val) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_[idx], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < width_; i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[idx], buffer, data, 0, + NULL, NULL); + _wrapper->clFinish(cmd_queue_[idx]); +} + +void OCLPerfDeviceConcurrency::checkData(cl_mem buffer, unsigned int idx) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_[idx], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, 
+ &error_); + totalIters = 0; + for (unsigned int i = 0; i < width_; i++) { + totalIters += data[i]; + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[idx], buffer, data, 0, + NULL, NULL); + _wrapper->clFinish(cmd_queue_[idx]); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfDeviceConcurrency::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + num_devices = 0; + cl_device_id *devices = NULL; + unsigned int i; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + + for (i = 0; i < MAX_DEVICES; i++) { + cmd_queue_[i] = 0; + program_[i] = 0; + kernel_[i] = 0; + outBuffer_[i] = 0; + } + + // Maximum iteration count + // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible + // by 16 NOTE: Can increase to get better peak performance numbers, but be + // sure not to TDR slow ASICs! NOTE:. for warmup run we use maxIter = 256 and + // then for the actual run we use maxIter = 8388608 * (engine_clock / 1000). + maxIter = 256; + + // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel + // processes 4 pixels at once NOTE: Can increase to get better peak + // performance numbers, but be sure not to TDR slow ASICs! 
+ width_ = 256; + + // We compute a square domain + bufSize_ = width_ * sizeof(cl_uint); + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + if (num_devices > MAX_DEVICES) { + num_devices = MAX_DEVICES; + } + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + context_ = _wrapper->clCreateContext(NULL, num_devices, devices, + notify_callback, NULL, &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cur_devices = _openTest + 1; + + for (i = 0; i < cur_devices; i++) { + cmd_queue_[i] = + _wrapper->clCreateCommandQueue(context_, devices[i], 0, NULL); + CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed"); + outBuffer_[i] = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed"); + } + + const char *tmp; + tmp = float_mandel_vec; + + for (i = 0; i < cur_devices; i++) 
{ + program_[i] = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed"); + + error_ = + _wrapper->clBuildProgram(program_[i], 1, &devices[i], "", NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = _wrapper->clGetProgramBuildInfo( + program_[i], devices[i], CL_PROGRAM_BUILD_LOG, 16384 * sizeof(char), + log, NULL); + printf("Build error on device %d -> %s\n", i, log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + } + + for (i = 0; i < cur_devices; i++) { + kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_); + CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed"); + } + + coordIdx = _openTest % numCoords; + float xStep = (float)(coords[coordIdx].width / (double)width_); + float yStep = (float)(-coords[coordIdx].width / (double)width_); + float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem), + (void *)&outBuffer_[i]); + error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint), + (void *)&width_); + error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float), + (void *)&xPos); + error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float), + (void *)&yPos); + error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float), + (void *)&xStep); + error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float), + (void *)&yStep); + error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint), + (void *)&maxIter); + } + + for (i = 0; i < cur_devices; i++) { + setData(outBuffer_[i], i, 0xdeadbeef); + } + + cl_uint clkFrequency = 0; + error_ = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CLOCK_FREQUENCY, + sizeof(clkFrequency), &clkFrequency, NULL); + CHECK_RESULT(error_ != 
CL_SUCCESS, "clGetDeviceInfo failed"); + assert(clkFrequency > 0); + maxIter = (unsigned int)(8388608 * ((float)clkFrequency / 1000)); + maxIter = (maxIter + 15) & ~15; +} + +void OCLPerfDeviceConcurrency::run(void) { + int global = width_ >> 2; + // We handle 4 pixels per thread + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + unsigned int i; + + // Warmup + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_[i], kernel_[i], 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + + for (i = 0; i < cur_devices; i++) { + _wrapper->clFlush(cmd_queue_[i]); + } + + for (i = 0; i < cur_devices; i++) { + _wrapper->clFinish(cmd_queue_[i]); + } + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint), + (void *)&maxIter); + } + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_[i], kernel_[i], 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + + for (i = 0; i < cur_devices; i++) { + _wrapper->clFlush(cmd_queue_[i]); + } + + for (i = 0; i < cur_devices; i++) { + _wrapper->clFinish(cmd_queue_[i]); + } + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + unsigned long long expected = + (unsigned long long)width_ * (unsigned long long)maxIter; + + for (i = 0; i < cur_devices; i++) { + checkData(outBuffer_[i], i); + CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!"); + } + + _perfInfo = (float)sec; + char buf[128]; + SNPRINTF(buf, sizeof(buf), "time for %2d devices (s) (%2d queues) ", + cur_devices, cur_devices); + testDescString = buf; +} + +unsigned int 
OCLPerfDeviceConcurrency::close(void) { + unsigned int i; + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clReleaseMemObject(outBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clReleaseKernel(kernel_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseKernel(kernel_) failed"); + } + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clReleaseProgram(program_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseProgram(program_) failed"); + } + + for (i = 0; i < cur_devices; i++) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h new file mode 100644 index 0000000000..eed83632a0 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceConcurrency.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
#ifndef _OCL_Perf_DeviceConcurrency_H_
#define _OCL_Perf_DeviceConcurrency_H_

#include "OCLTestImp.h"

// Performance test that runs the same Mandelbrot kernel on 1..N devices at
// once (sub-test k uses k + 1 devices, one command queue each) and reports
// the wall time of the concurrent launch.
class OCLPerfDeviceConcurrency : public OCLTestImp {
 public:
  // Counts the available devices and sets the number of sub-tests to it.
  OCLPerfDeviceConcurrency();
  virtual ~OCLPerfDeviceConcurrency();

 public:
  // Creates the context and the per-device queue, buffer, program and kernel.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Warm-up launch, timed launch on every device, then result validation.
  virtual void run(void);
  // Releases everything open() created; returns _crcword.
  virtual unsigned int close(void);

  std::string shader_;  // NOTE(review): not referenced by the implementation
                        // visible alongside this header; possibly unused
  // Fills the first width_ words of `buffer` with `data` via queue `idx`.
  void setData(cl_mem buffer, unsigned int idx, unsigned int data);
  // Sums the first width_ words of `buffer` into totalIters via queue `idx`.
  void checkData(cl_mem buffer, unsigned int idx);

// Upper bound on devices used; also sizes the per-device arrays below.
// Being a preprocessor macro it is visible file-wide, not just in the class.
#define MAX_DEVICES 16

  // NOTE(review): context_ and error_ presumably hide same-named members of
  // the base class (sibling tests use them without declaring them) — confirm.
  cl_context context_;                       // one context over all devices
  cl_command_queue cmd_queue_[MAX_DEVICES];  // one queue per used device
  cl_program program_[MAX_DEVICES];          // one program per used device
  cl_kernel kernel_[MAX_DEVICES];            // "mandelbrot" kernel per device
  cl_mem outBuffer_[MAX_DEVICES];            // per-device output buffer
  cl_int error_;                             // status of the last CL call

  cl_uint num_devices;  // devices found on the platform, capped at MAX_DEVICES
  cl_uint cur_devices;  // devices used by the current sub-test (test + 1)

  unsigned int width_;    // number of output words; set to 256 in open()
  unsigned int bufSize_;  // output buffer size in bytes (width_ * 4)
  unsigned int maxIter;   // kernel iteration limit: 256 for the warm-up, then
                          // scaled from the device clock frequency
  unsigned int coordIdx;  // index into the coordinate table used by open()
  unsigned long long totalIters;  // sum produced by the last checkData()
};

#endif  // _OCL_Perf_DeviceConcurrency_H_
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfDeviceEnqueue.h" + +#include +#include +#include +#include + +#include "CL/cl.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define KERNEL_CODE(...) 
#__VA_ARGS__ + +typedef struct { + unsigned int threads; +} testStruct; + +static testStruct testList[] = { + {64}, {128}, {256}, {512}, {1024}, {2048}, {4096}, +}; + +const static char* strKernel = {KERNEL_CODE( + \n __kernel void childKernel(__global uint* buf) { + int idx = get_global_id(0); + if (idx < 0) { + buf[idx] = 0; + } +} + \n __kernel void parentKernel(__global uint* buf) { + queue_t def_q = get_default_queue(); + ndrange_t ndrange = ndrange_1D(64, 64); + int gid = get_global_id(0); + + int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{ + childKernel(buf); + }); +} + \n)}; + +OCLPerfDeviceEnqueue::OCLPerfDeviceEnqueue() { + testListSize = sizeof(testList) / sizeof(testStruct); + _numSubTests = 7 * testListSize; + deviceQueue_ = NULL; + failed_ = false; + kernel2_ = NULL; +} + +OCLPerfDeviceEnqueue::~OCLPerfDeviceEnqueue() {} + +void OCLPerfDeviceEnqueue::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + if (type_ == CL_DEVICE_TYPE_CPU) { + return; + } + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + + threads = testList[testID_ % testListSize].threads; + size_t param_size = 0; + char* strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[7] < '2') { + failed_ = true; + return; + } + delete strVersion; + cl_uint maxDevQSize = 0; +#if defined(CL_VERSION_2_0) + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, + sizeof(cl_uint), &maxDevQSize, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); +#endif + + 
program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + + // Hardcoded for us + if (testID_ >= testListSize) { + queueSize = (1 << (testID_ / testListSize)) * 256 * 1024; + queueSize = std::min(queueSize, maxDevQSize); + threads *= (1 << (testID_ / testListSize - 1)); + threads = std::min(threads, queueSize / 128); + } else { + queueSize = std::max((cl_uint)threads * 128, (cl_uint)16384); + } + +#if defined(CL_VERSION_2_0) + const cl_queue_properties cprops[] = { + CL_QUEUE_PROPERTIES, + static_cast(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + CL_QUEUE_ON_DEVICE_DEFAULT | + CL_QUEUE_ON_DEVICE), + CL_QUEUE_SIZE, queueSize, 0}; + deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[deviceId], cprops, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); +#endif +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void 
OCLPerfDeviceEnqueue::run(void) { + CPerfCounter timer; + if (type_ == CL_DEVICE_TYPE_CPU) { + return; + } + + if (failed_) return; + + cl_mem buffer = buffers()[0]; + + size_t gws[1] = {threads}; + size_t lws[1] = {64}; + + if (gws[0] >= 256) { + lws[0] = 256; + } + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + + // Try to normalize the amount of work per test + unsigned int repeats = (64 / threads) * 50; + if (repeats == 0) repeats = 1; + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < repeats; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + } + timer.Stop(); + + double sec = timer.GetElapsedTime(); + + _perfInfo = (float)(threads * repeats) / (float)(sec * 1000000.); + char buf[256]; + SNPRINTF(buf, sizeof(buf), + "%7d threads spawning 64 threads, queue size %5dKB (Mdisp/s)", + threads, queueSize / 1024); + testDescString = buf; +} + +unsigned int OCLPerfDeviceEnqueue::close(void) { + // FIXME: Re-enable CPU test once bug 10143 is fixed. 
+ if (type_ == CL_DEVICE_TYPE_CPU) { + return 0; + } + + if (NULL != deviceQueue_) { + _wrapper->clReleaseCommandQueue(deviceQueue_); + } + if (NULL != kernel2_) { + _wrapper->clReleaseKernel(kernel2_); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h new file mode 100644 index 0000000000..c1a033fb48 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
#ifndef _OCLPERF_DEVICE_ENQUEUE_H_
#define _OCLPERF_DEVICE_ENQUEUE_H_

#include "OCLTestImp.h"

// Performance test for device-side enqueue: a host-launched parent kernel in
// which every work-item enqueues a 64-work-item child kernel on the default
// on-device queue.
class OCLPerfDeviceEnqueue : public OCLTestImp {
 public:
  OCLPerfDeviceEnqueue();
  virtual ~OCLPerfDeviceEnqueue();

 public:
  // Builds the kernels and creates the on-device queue for sub-test `test`.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Warm-up launch followed by the timed launches of the parent kernel.
  virtual void run(void);
  // Releases deviceQueue_ and kernel2_, then calls OCLTestImp::close().
  virtual unsigned int close(void);

 private:
  cl_command_queue deviceQueue_;  // default on-device queue, NULL until created
  bool failed_;                   // set when the device version is below 2.x;
                                  // run() then does nothing
  unsigned int testID_;           // sub-test index passed to open()
  cl_kernel kernel2_;             // the "childKernel" kernel object
  unsigned int testListSize;      // number of entries in the thread-count table
  unsigned int threads;           // global work size of the parent kernel
  cl_uint queueSize;              // on-device queue size in bytes
};

#endif  // _OCLPERF_DEVICE_ENQUEUE_H_
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueue2.h"
+
+// NOTE(review): the four system header names were lost in this copy of the
+// patch; they are reconstructed from what this file uses (CPerfCounter,
+// printf/snprintf) -- confirm against the repository.
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Stringifies its arguments so the kernel can be written as plain code.
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;
+} testStruct;
+
+// Global work sizes used for the host-side launch of parentKernel.
+static testStruct testList[] = {
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+
+// Device queue sizes in KB (multiplied by 1024 in open()).
+static unsigned int qsizeList[] = {
+    16, 32, 64, 128, 256, 512,
+};
+
+// Values passed as the kernels' `level` argument.
+static unsigned int levelList[] = {
+    1,
+    2,
+    4,
+    8,
+};
+
+// parentKernel enqueues childKernel on the default device queue; childKernel
+// re-enqueues itself from local id 0 until `level` reaches zero.
+const static char* strKernel = {KERNEL_CODE(
+    \n __kernel void childKernel(__global uint* buf, uint level) {
+      if (level) {
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(64, 64);
+        int gid = get_global_id(0);
+        int lid = get_local_id(0);
+        if (lid == 0) {
+          int enq_res =
+              enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+                childKernel(buf, level - 1);
+              });
+        }
+      } else {
+        int idx = get_global_id(0);
+        if (idx < 0) {
+          buf[idx] = 0;
+        }
+      }
+    }
+    \n __kernel void parentKernel(__global uint* buf, uint level) {
+      queue_t def_q = get_default_queue();
+      ndrange_t ndrange = ndrange_1D(64, 64);
+      int gid = get_global_id(0);
+
+      if (level) {
+        int enq_res =
+            enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
+              childKernel(buf, level - 1);
+            });
+      }
+    }
+    \n)};
+
+// Returns true when a CL_DEVICE_VERSION string of `size` bytes
+// ("OpenCL <major>.<minor> ...") reports a major version below 2, or is too
+// short to contain the major-version digit at index 7.
+static bool isPreOpenCL20(const char* version, size_t size) {
+  return (size <= 7) || (version[7] < '2');
+}
+
+// One sub-test per (thread count, queue size, level) combination.
+OCLPerfDeviceEnqueue2::OCLPerfDeviceEnqueue2() {
+  subTests_level = sizeof(levelList) / sizeof(unsigned int);
+  subTests_qsize = (sizeof(qsizeList) / sizeof(unsigned int));
+  subTests_thread = sizeof(testList) / sizeof(testStruct);
+  testListSize = subTests_thread;
+  _numSubTests = subTests_level * subTests_qsize * subTests_thread;
+  deviceQueue_ = NULL;
+  failed_ = false;
+  kernel2_ = NULL;
+  level = 2;
+  skip_ = false;
+}
+
+OCLPerfDeviceEnqueue2::~OCLPerfDeviceEnqueue2() {}
+
+// Decodes `test` into threads/queueSize/level, builds the program, creates
+// both kernels, a 2048-byte buffer and the default on-device queue.
+// Sets failed_ when the device reports a version below 2.x and skip_ when the
+// headers were built without CL_VERSION_2_0; run() returns early for both.
+void OCLPerfDeviceEnqueue2::open(unsigned int test, char* units,
+                                 double& conversion, unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  // test = (thread index * qsize count + qsize index) * level count + level
+  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
+  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
+  level = levelList[testID_ % subTests_level];
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Decide and free before any early return so the buffer never leaks.
+  const bool preCL20 =
+      (error_ == CL_SUCCESS) && isPreOpenCL20(strVersion, param_size);
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    programLog[0] = '\0';  // stays printable if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                           2048, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds.  Test Skipped.";
+  return;
+#endif
+}
+
+// Empty context-notification callback; not referenced in this file.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Launches parentKernel once as warm-up, then times `repeats` launches (each
+// followed by clFinish) and reports threads * repeats * level per microsecond.
+void OCLPerfDeviceEnqueue2::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU || failed_ || skip_) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {threads};
+  size_t lws[1] = {64};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Untimed warm-up launch.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Try to normalize the amount of work per test
+  unsigned int repeats = (4096 / threads) * 10;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  _perfInfo = (float)(threads * repeats * level) / (float)(sec * 1000000.);
+  char buf[256];
+  // All three values are unsigned, hence %u.
+  SNPRINTF(
+      buf, sizeof(buf),
+      "%5u threads spawning 64 threads, queue size %3uKB (Mdisp/s), level=%2u",
+      threads, queueSize / 1024, level);
+  testDescString = buf;
+}
+
+// Releases the device queue and the child kernel, then defers to
+// OCLTestImp::close(). Handles are cleared after release so a later close()
+// that follows a failed open() cannot release them a second time.
+unsigned int OCLPerfDeviceEnqueue2::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    deviceQueue_ = NULL;
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel2_ = NULL;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h
new file mode 100644
index 0000000000..2a4bde8ced
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueue2.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
 */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE2_H_
+#define _OCLPERF_DEVICE_ENQUEUE2_H_
+
+#include "OCLTestImp.h"
+
+// Device-enqueue perf test: a host-launched parentKernel enqueues childKernel
+// on the default on-device queue. Sub-tests enumerate every combination of
+// thread count, device queue size and level.
+class OCLPerfDeviceEnqueue2 : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueue2();
+  virtual ~OCLPerfDeviceEnqueue2();
+
+ public:
+  // Overrides of the OCLTestImp entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Default on-device queue created in open(), released in close().
+  cl_command_queue deviceQueue_;
+  // Sub-test index passed to open().
+  unsigned int testID_;
+  // "childKernel" kernel object created in open(), released in close().
+  cl_kernel kernel2_;
+  // Set to the number of thread-count entries by the constructor.
+  unsigned int testListSize;
+  // Global work size of the host-side parentKernel launch.
+  unsigned int threads;
+  // Value passed as CL_QUEUE_SIZE (KB table entry * 1024).
+  cl_uint queueSize;
+  // Entry counts of the level / queue-size / thread tables.
+  unsigned int subTests_level;
+  unsigned int subTests_qsize;
+  unsigned int subTests_thread;
+  // Value passed as parentKernel's `level` argument.
+  unsigned int level;
+  // Not referenced by OCLPerfDeviceEnqueue2.cpp (run() uses a literal 64).
+  unsigned int lws_value;
+
+  // Set by open() when the device version is below 2.x; run() then returns.
+  bool failed_;
+  // Set by open() when built without CL_VERSION_2_0; run() then returns.
+  bool skip_;
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE2_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp
new file mode 100644
index 0000000000..6fa7dcab50
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.cpp
@@ -0,0 +1,267 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueueEvent.h"
+
+// NOTE(review): the four system header names were lost in this copy of the
+// patch; they are reconstructed from what this file uses (CPerfCounter,
+// printf/snprintf) -- confirm against the repository.
+#include <Timer.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Stringifies its arguments so the kernel can be written as plain code.
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;
+} testStruct;
+
+// Global work sizes used for the host-side launch of parentKernel.
+static testStruct testList[] = {
+    {64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
+};
+
+// Device queue sizes in KB (multiplied by 1024 in open()).
+static unsigned int qsizeList[] = {
+    16, 32, 64, 128, 256, 512,
+};
+
+// Values passed as the kernels' `level` argument.
+static unsigned int levelList[] = {
+    1,
+    2,
+    4,
+    8,
+};
+
+// parentKernel enqueues childKernel `level` times with NO_WAIT, requesting a
+// return event for each enqueue, then completes and releases its user event.
+const static char* strKernel = {KERNEL_CODE(
+    \n __kernel void childKernel(__global uint* buf, uint level,
+                                 clk_event_t wait_evt) {
+      int idx = get_global_id(0);
+      if (idx < 0) {
+        buf[idx] = 0;
+      }
+    }
+    \n __kernel void parentKernel(__global uint* buf, uint level) {
+      if (level) {
+        queue_t def_q = get_default_queue();
+        ndrange_t ndrange = ndrange_1D(64, 64);
+        clk_event_t user_evt = create_user_event();
+        clk_event_t block_evt, wait_evt;
+        wait_evt = user_evt;
+
+        for (uint i = 0; i < level; i++) {
+          int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT,
+                                       ndrange, 0,
+                                       /*&user_evt*/ NULL, &block_evt, ^{
+                                         childKernel(buf, level - 1, block_evt);
+                                       });
+
+          // wait_evt = block_evt;
+        }
+        if (is_valid_event(user_evt)) {
+          set_user_event_status(user_evt, CL_COMPLETE);
+          release_event(user_evt);
+        }
+      } else {
+        int idx = get_global_id(0);
+        if (idx < 0) {
+          buf[idx] = 0;
+        }
+      }
+    }
+    \n)};
+
+// Returns true when a CL_DEVICE_VERSION string of `size` bytes
+// ("OpenCL <major>.<minor> ...") reports a major version below 2, or is too
+// short to contain the major-version digit at index 7.
+static bool isPreOpenCL20(const char* version, size_t size) {
+  return (size <= 7) || (version[7] < '2');
+}
+
+// One sub-test per (thread count, queue size, level) combination.
+OCLPerfDeviceEnqueueEvent::OCLPerfDeviceEnqueueEvent() {
+  subTests_level = sizeof(levelList) / sizeof(unsigned int);
+  subTests_qsize = (sizeof(qsizeList) / sizeof(unsigned int));
+  subTests_thread = sizeof(testList) / sizeof(testStruct);
+  testListSize = subTests_thread;
+  _numSubTests = subTests_level * subTests_qsize * subTests_thread;
+  deviceQueue_ = NULL;
+  failed_ = false;
+  skip_ = false;
+  kernel2_ = NULL;
+  level = 2;
+}
+
+OCLPerfDeviceEnqueueEvent::~OCLPerfDeviceEnqueueEvent() {}
+
+// Decodes `test` into threads/queueSize/level, builds the program, creates
+// both kernels, a 2048-byte buffer and the default on-device queue.
+// Sets failed_ when the device reports a version below 2.x and skip_ when the
+// headers were built without CL_VERSION_2_0; run() returns early for both.
+void OCLPerfDeviceEnqueueEvent::open(unsigned int test, char* units,
+                                     double& conversion,
+                                     unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  // test = (thread index * qsize count + qsize index) * level count + level
+  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
+  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
+  level = levelList[testID_ % subTests_level];
+
+  lws_value = 64;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Decide and free before any early return so the buffer never leaks.
+  const bool preCL20 =
+      (error_ == CL_SUCCESS) && isPreOpenCL20(strVersion, param_size);
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    programLog[0] = '\0';  // stays printable if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                           2048, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds.  Test Skipped.";
+  return;
+#endif
+}
+
+// Empty context-notification callback; not referenced in this file.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Launches parentKernel once as warm-up, then times `repeats` launches (each
+// followed by clFinish) and reports threads * repeats * level per microsecond.
+void OCLPerfDeviceEnqueueEvent::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU || failed_ || skip_) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {threads};
+  size_t lws[1] = {lws_value};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Untimed warm-up launch.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, lws, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  // Try to normalize the amount of work per test
+  unsigned int repeats = (4096 / threads) * 10;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  _perfInfo = (float)(threads * repeats * level) / (float)(sec * 1000000.);
+  char buf[256];
+  // All four values are unsigned, hence %u.
+  SNPRINTF(
+      buf, sizeof(buf),
+      "%5u threads spawning %2u threads, queue size %3uKB (Mdisp/s), level=%2u",
+      threads, lws_value, queueSize / 1024, level);
+  testDescString = buf;
+}
+
+// Releases the device queue and the child kernel, then defers to
+// OCLTestImp::close(). Handles are cleared after release so a later close()
+// that follows a failed open() cannot release them a second time.
+unsigned int OCLPerfDeviceEnqueueEvent::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    deviceQueue_ = NULL;
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel2_ = NULL;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h
new file mode 100644
index 0000000000..f7c37c3f51
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueEvent.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
 */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
+#define _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
+
+#include "OCLTestImp.h"
+
+// Device-enqueue perf test in which parentKernel enqueues childKernel `level`
+// times and requests a return event for each enqueue. Sub-tests enumerate
+// every combination of thread count, device queue size and level.
+class OCLPerfDeviceEnqueueEvent : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueueEvent();
+  virtual ~OCLPerfDeviceEnqueueEvent();
+
+ public:
+  // Overrides of the OCLTestImp entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Default on-device queue created in open(), released in close().
+  cl_command_queue deviceQueue_;
+  // Sub-test index passed to open().
+  unsigned int testID_;
+  // "childKernel" kernel object created in open(), released in close().
+  cl_kernel kernel2_;
+  // Set to the number of thread-count entries by the constructor.
+  unsigned int testListSize;
+  // Global work size of the host-side parentKernel launch.
+  unsigned int threads;
+  // Value passed as CL_QUEUE_SIZE (KB table entry * 1024).
+  cl_uint queueSize;
+  // Entry counts of the level / queue-size / thread tables.
+  unsigned int subTests_level;
+  unsigned int subTests_qsize;
+  unsigned int subTests_thread;
+  // Value passed as parentKernel's `level` argument.
+  unsigned int level;
+  // Local work size of the host-side launch; open() sets it to 64.
+  unsigned int lws_value;
+
+  // Set by open() when the device version is below 2.x; run() then returns.
+  bool failed_;
+  // Set by open() when built without CL_VERSION_2_0; run() then returns.
+  bool skip_;
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp
new file mode 100644
index 0000000000..da048933f8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.cpp
@@ -0,0 +1,233 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDeviceEnqueueSier.h"
+
+// NOTE(review): the five system header names were lost in this copy of the
+// patch; they are reconstructed from what this file uses (CPerfCounter,
+// printf/snprintf) -- confirm against the repository.
+#include <Timer.h>
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CL/cl.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Stringifies its arguments so the kernel can be written as plain code.
+#define KERNEL_CODE(...) #__VA_ARGS__
+
+typedef struct {
+  unsigned int threads;
+} testStruct;
+
+// Edge lengths of the 2-D launch grid; every entry is a power of three.
+static unsigned int sizeList[] = {
+    81, 243, 729, 2187, 6561, 19683, 59049,
+};
+
+// Work-items at the origin of each outer third of the grid enqueue the same
+// kernel again on a grid one third the size, until a third is 1 wide.
+const static char* strKernel = {KERNEL_CODE(
+    \n __kernel void parentKernel(__global uint* buf, int width, int offsetx,
+                                  int offsety) {
+      int x = get_global_id(0);
+      int y = get_global_id(1);
+      queue_t q = get_default_queue();
+
+      int one_third = get_global_size(0) / 3;
+      int two_thirds = 2 * one_third;
+
+      if (x >= one_third && x < two_thirds && y >= one_third &&
+          y < two_thirds) {
+        int idx = get_global_id(0);
+        if (idx < 0) {
+          buf[idx] = 0;
+        }
+      } else {
+        if (one_third > 1 && x % one_third == 0 && y % one_third == 0) {
+          const size_t grid[2] = {one_third, one_third};
+          enqueue_kernel(q, 0, ndrange_2D(grid), ^{
+            parentKernel(buf, width, x + offsetx, y + offsety);
+          });
+        }
+      }
+    }
+    \n)};
+
+// Returns true when a CL_DEVICE_VERSION string of `size` bytes
+// ("OpenCL <major>.<minor> ...") reports a major version below 2, or is too
+// short to contain the major-version digit at index 7.
+static bool isPreOpenCL20(const char* version, size_t size) {
+  return (size <= 7) || (version[7] < '2');
+}
+
+// Returns 8^(log3(size) - 1) for a power-of-three `size`, computed with exact
+// integer steps. pow()/log() can land just below the integer and truncate
+// (243 gave 4095), and the result is kept in double because 8^9 * repeats no
+// longer fits in 32 bits.
+static double sierKernelCount(unsigned int size) {
+  double count = 1.0;
+  for (unsigned int s = size; s >= 9; s /= 3) {
+    count *= 8.0;
+  }
+  return count;
+}
+
+// One sub-test per grid size in sizeList.
+OCLPerfDeviceEnqueueSier::OCLPerfDeviceEnqueueSier() {
+  _numSubTests = sizeof(sizeList) / sizeof(unsigned int);
+  deviceQueue_ = NULL;
+  failed_ = false;
+  skip_ = false;
+}
+
+OCLPerfDeviceEnqueueSier::~OCLPerfDeviceEnqueueSier() {}
+
+// Builds the program, creates parentKernel, a 2048-byte buffer and a 512 KB
+// default on-device queue; `test` selects the grid size.
+// Sets failed_ when the device reports a version below 2.x and skip_ when the
+// headers were built without CL_VERSION_2_0; run() returns early for both.
+void OCLPerfDeviceEnqueueSier::open(unsigned int test, char* units,
+                                    double& conversion, unsigned int deviceId) {
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return;
+  }
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+
+  size_t param_size = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // Decide and free before any early return so the buffer never leaks.
+  const bool preCL20 =
+      (error_ == CL_SUCCESS) && isPreOpenCL20(strVersion, param_size);
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    programLog[0] = '\0';  // stays printable if the log query itself fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                           2048, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+
+  queueSize = 512 * 1024;
+
+  image_size = sizeList[testID_];
+
+#if defined(CL_VERSION_2_0)
+  const cl_queue_properties cprops[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                                       CL_QUEUE_ON_DEVICE_DEFAULT |
+                                       CL_QUEUE_ON_DEVICE),
+      CL_QUEUE_SIZE, queueSize, 0};
+  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
+      context_, devices_[deviceId], cprops, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clCreateCommandQueueWithProperties() failed");
+#else
+  skip_ = true;
+  testDescString =
+      "DeviceEnqueue NOT supported for < 2.0 builds.  Test Skipped.";
+  return;
+#endif
+}
+
+// Empty context-notification callback; not referenced in this file.
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Runs a single-work-item warm-up launch, then times 100 launches on an
+// image_size x image_size grid (each followed by clFinish) and reports
+// sierKernelCount(image_size) * repeats per microsecond.
+void OCLPerfDeviceEnqueueSier::run(void) {
+  CPerfCounter timer;
+  if (type_ == CL_DEVICE_TYPE_CPU || failed_ || skip_) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+
+  size_t gws[1] = {1};
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  int width = image_size, offsetx = 0, offsety = 0;
+  error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(int), (void*)&width);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(int), (void*)&offsetx);
+  error_ |= _wrapper->clSetKernelArg(kernel_, 3, sizeof(int), (void*)&offsety);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  // Untimed warm-up; the local size is left to the runtime.
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+  size_t global_work_size[2] = {image_size, image_size};
+
+  // Try to normalize the amount of work per test
+  unsigned int repeats = 100;
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < repeats; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
+                                              NULL, global_work_size, NULL, 0,
+                                              NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+  }
+  timer.Stop();
+
+  double sec = timer.GetElapsedTime();
+
+  const double numOfKernels = sierKernelCount(image_size);
+  _perfInfo = (float)(numOfKernels * repeats) / (float)(sec * 1000000.);
+  char buf[256];
+  // Both values are unsigned, hence %u.
+  SNPRINTF(buf, sizeof(buf), "image_size = %5u, queue size %3uKB (Mdisp/s)",
+           image_size, queueSize / 1024);
+  testDescString = buf;
+}
+
+// Releases the on-device queue (when one was created) and defers the rest of
+// the teardown to OCLTestImp::close(). The handle is cleared after release so
+// a later close() that follows a failed open() cannot release it twice.
+unsigned int OCLPerfDeviceEnqueueSier::close(void) {
+  // FIXME: Re-enable CPU test once bug 10143 is fixed.
+  if (type_ == CL_DEVICE_TYPE_CPU) {
+    return 0;
+  }
+
+  if (deviceQueue_) {
+    error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    deviceQueue_ = NULL;
+  }
+
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h
new file mode 100644
index 0000000000..dc4f5132cd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDeviceEnqueueSier.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
 */
+
+#ifndef _OCLPERF_DEVICE_ENQUEUE_SIER_H_
+#define _OCLPERF_DEVICE_ENQUEUE_SIER_H_
+
+#include "OCLTestImp.h"
+
+// Device-enqueue perf test whose kernel recursively re-enqueues itself on
+// grids one third the size. One sub-test per grid edge length.
+class OCLPerfDeviceEnqueueSier : public OCLTestImp {
+ public:
+  OCLPerfDeviceEnqueueSier();
+  virtual ~OCLPerfDeviceEnqueueSier();
+
+ public:
+  // Overrides of the OCLTestImp entry points.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Default on-device queue created in open(), released in close().
+  cl_command_queue deviceQueue_;
+  // Sub-test index passed to open(); selects the grid size.
+  unsigned int testID_;
+  // Not assigned or read anywhere in OCLPerfDeviceEnqueueSier.cpp.
+  unsigned int testListSize;
+  // unsigned int threads;
+  // Value passed as CL_QUEUE_SIZE; open() sets it to 512 * 1024.
+  cl_uint queueSize;
+  // Edge length of the timed 2-D launch grid.
+  unsigned int image_size;
+
+  // Set by open() when the device version is below 2.x; run() then returns.
+  bool failed_;
+  // Set by open() when built without CL_VERSION_2_0; run() then returns.
+  bool skip_;
+};
+
+#endif  // _OCLPERF_DEVICE_ENQUEUE_SIER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp
new file mode 100644
index 0000000000..8ebef5c33c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.cpp
@@ -0,0 +1,391 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDispatchSpeed.h"
+
+// NOTE(review): the header names on the next three lines were lost when this
+// patch text was extracted. These are the C headers the code below needs
+// (printf/snprintf, malloc/free) - confirm against the original commit.
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define CHAR_BUF_SIZE 512
+
+// One sub-test: how many kernel launches are timed, and after how many
+// launches the queue is flushed and waited on (-1: only after the last one).
+typedef struct {
+  unsigned int iterations;
+  int flushEvery;
+} testStruct;
+
+testStruct testList[] = {
+    {1, -1},         {1, -1},      {10, 1},      {10, -1},      {100, 1},
+    {100, 10},       {100, -1},    {1000, 1},    {1000, 10},    {1000, 100},
+    {1000, -1},      {10000, 1},   {10000, 10},  {10000, 100},  {10000, 1000},
+    {10000, -1},     {100000, 1},  {100000, 10}, {100000, 100}, {100000, 1000},
+    {100000, 10000}, {100000, -1},
+};
+
+// Iteration counts used by OCLPerfMapDispatchSpeed (map + unmap + launch).
+unsigned int mapTestList[] = {1, 1, 10, 100, 1000, 10000, 100000};
+
+// Builds the kernel source. The kernel body never stores anything
+// (get_global_id() is never negative), so only launch overhead is measured.
+void OCLPerfDispatchSpeed::genShader(void) {
+  shader_.clear();
+  shader_ +=
+      "__kernel void _dispatchSpeed(__global float *outBuf)\n"
+      "{\n"
+      " int i = (int) get_global_id(0);\n"
+      " if (i < 0)\n"
+      " outBuf[i] = 0.0f;\n"
+      "}\n";
+}
+
+// Sub-tests: testList x {no warm-up, warm-up} x {spin, sleep}.
+OCLPerfDispatchSpeed::OCLPerfDispatchSpeed() {
+  testListSize = sizeof(testList) / sizeof(testStruct);
+  _numSubTests = 2 * 2 * testListSize;
+}
+
+OCLPerfDispatchSpeed::~OCLPerfDispatchSpeed() {}
+
+// Context error callback; intentionally ignores everything it is told.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Decodes the sub-test index, then creates the context, queue, output buffer,
+// program and kernel used by run().
+//   test       sub-test index: (test % testListSize) selects the testList
+//              entry, the next bit enables the warm-up launch, and indices
+//              >= 2 * testListSize wait with clFinish instead of spinning.
+//   units      unused here.
+//   conversion set to 1.0.
+//   deviceId   index into the device list of platform _platformIndex.
+// NOTE(review): CHECK_RESULT is assumed to return from the function when its
+// condition is true; temporaries are therefore released before each check.
+void OCLPerfDispatchSpeed::open(unsigned int test, char *units,
+                                double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test % testListSize;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+  doWarmup = ((test / testListSize) % 2) != 0;
+  sleep = (test >= (testListSize * 2));
+
+  bufSize_ = 64 * sizeof(cl_float);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only index the array when the query worked and the index is in range;
+    // otherwise platform stays NULL and the check further down reports it.
+    if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    // Allocated with new[], so it must be released with delete[]. Done
+    // before the check so the early-return path does not leak it.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    if (platform != NULL) {
+      /* Get the number of requested devices */
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so the result is deliberately not checked;
+      // a zero count is caught by the checks below.
+      num_devices = 0;
+      error_ =
+          _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    }
+  } else {
+    CHECK_RESULT(numPlatforms == 0, "No platforms available!");
+  }
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // The list is only needed to pick one id; free it before any early return.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, &error_);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  const char *source = shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    // Keep the buffer printable even if the log query itself fails.
+    log[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    CHECK_RESULT(0, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_dispatchSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+}
+
+// Times testList[_openTest].iterations kernel launches and stores the average
+// cost per launch, in microseconds, in _perfInfo. Every flushEvery launches
+// (and once at the end) the host waits for the most recent launch, either
+// with clFinish (sleep) or by polling the event status (spin).
+void OCLPerfDispatchSpeed::run(void) {
+  int global = bufSize_ / sizeof(cl_float);
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+  cl_event event;
+  // Initialised so a failing status query can never leave the spin loops
+  // reading an indeterminate value.
+  cl_int eventStatus = CL_COMPLETE;
+
+  if (doWarmup) {
+    // No event is requested here: clFinish() already waits for the launch,
+    // and an event nobody releases would be leaked.
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testList[_openTest].iterations; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, &event);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    if ((testList[_openTest].flushEvery > 0) &&
+        (((i + 1) % testList[_openTest].flushEvery) == 0)) {
+      if (sleep) {
+        _wrapper->clFinish(cmd_queue_);
+      } else {
+        _wrapper->clFlush(cmd_queue_);
+        // Poll until the launch is complete (status <= 0) or the query
+        // itself fails; without the error test a failure would spin forever.
+        do {
+          error_ =
+              _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                       sizeof(cl_int), &eventStatus, NULL);
+        } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+      }
+    }
+    // The last event is kept alive for the final wait below.
+    if (i != (testList[_openTest].iterations - 1)) {
+      _wrapper->clReleaseEvent(event);
+    }
+  }
+  if (sleep) {
+    _wrapper->clFinish(cmd_queue_);
+  } else {
+    _wrapper->clFlush(cmd_queue_);
+    do {
+      error_ =
+          _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof(cl_int), &eventStatus, NULL);
+    } while ((error_ == CL_SUCCESS) && (eventStatus > 0));
+  }
+  _wrapper->clReleaseEvent(event);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // microseconds per launch
+  double perf = (1000000.f * sec / testList[_openTest].iterations);
+  const char *waitType;
+  const char *extraChar;
+  const char *n;
+  const char *warmup;
+  if (sleep) {
+    waitType = "sleep";
+    extraChar = "";
+    n = "";
+  } else {
+    waitType = "spin";
+    n = "n";
+    extraChar = " ";
+  }
+  if (doWarmup) {
+    warmup = "warmup";
+  } else {
+    warmup = "";
+  }
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  // iterations is unsigned, so it is printed with %u.
+  if (testList[_openTest].flushEvery > 0) {
+    SNPRINTF(buf, sizeof(buf),
+             " %7u dispatches %s%sing every %5d %6s (us/disp)",
+             testList[_openTest].iterations, waitType, n,
+             testList[_openTest].flushEvery, warmup);
+  } else {
+    SNPRINTF(buf, sizeof(buf),
+             " %7u dispatches (%s%s) %6s (us/disp)",
+             testList[_openTest].iterations, waitType, extraChar, warmup);
+  }
+  testDescString = buf;
+}
+
+// Releases every OpenCL object created by open() and returns _crcword.
+unsigned int OCLPerfDispatchSpeed::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
+
+// Sub-tests: mapTestList x {no warm-up, warm-up}.
+OCLPerfMapDispatchSpeed::OCLPerfMapDispatchSpeed() {
+  testListSize = sizeof(mapTestList) / sizeof(unsigned int);
+  _numSubTests = 2 * testListSize;
+}
+
+// Times mapTestList[_openTest] rounds of blocking map, unmap and kernel
+// launch on a CL_MEM_ALLOC_HOST_PTR buffer created for this run, and stores
+// the average cost per round, in microseconds, in _perfInfo.
+void OCLPerfMapDispatchSpeed::run(void) {
+  cl_mem outBuffer;
+  outBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                       bufSize_, NULL, &error_);
+  // Check the buffer created just above, not the member made by open().
+  CHECK_RESULT(outBuffer == 0, "clCreateBuffer(outBuffer) failed");
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer);
+
+  int global = bufSize_ / sizeof(cl_float);
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  if (doWarmup) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Reset();
+  timer.Start();
+  void *mem;
+  for (unsigned int i = 0; i < mapTestList[_openTest]; i++) {
+    mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer, CL_TRUE,
+                                       CL_MAP_WRITE_INVALIDATE_REGION, 0,
+                                       bufSize_, 0, NULL, NULL, &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // microseconds per launch
+  double perf = (1000000.f * sec / mapTestList[_openTest]);
+  const char *warmup;
+  if (doWarmup) {
+    warmup = "warmup";
+  } else {
+    warmup = "";
+  }
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  // mapTestList entries are unsigned, so they are printed with %u.
+  SNPRINTF(buf, sizeof(buf), " %7u maps and dispatches %6s (us/disp)",
+           mapTestList[_openTest], warmup);
+  testDescString = buf;
+
+  _wrapper->clReleaseMemObject(outBuffer);
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h
new file mode 100644
index 0000000000..2dfc7bd70b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDispatchSpeed.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_DispatchSpeed_H_
+#define _OCL_DispatchSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Kernel-dispatch overhead test. open() builds a context, queue, buffer and
+// a trivial kernel; run() times repeated launches; close() releases them.
+class OCLPerfDispatchSpeed : public OCLTestImp {
+ public:
+  OCLPerfDispatchSpeed();
+  virtual ~OCLPerfDispatchSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Kernel source text; (re)filled by genShader().
+  std::string shader_;
+  void genShader(void);
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  // Result of the most recent OpenCL call.
+  cl_int error_;
+  // When true, run() issues one untimed launch before starting the timer.
+  bool doWarmup;
+
+  // Size of outBuffer_ in bytes.
+  unsigned int bufSize_;
+  // When true, run() waits with clFinish; otherwise it polls the event status.
+  bool sleep;
+  // Number of entries in the table the constructor sized the sub-tests from.
+  unsigned int testListSize;
+};
+
+// Variant that only overrides run(); open() and close() are inherited.
+class OCLPerfMapDispatchSpeed : public OCLPerfDispatchSpeed {
+ public:
+  OCLPerfMapDispatchSpeed();
+  virtual void run(void);
+};
+#endif  // _OCL_DispatchSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp
new file mode 100644
index 0000000000..6315e1a151
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.cpp
@@ -0,0 +1,442 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfDoubleDMA.h"
+
+// NOTE(review): the names of the six system headers below, and every
+// template argument in this file, were lost when the patch text was
+// extracted. They are reconstructed from what the code uses (floor, printf,
+// std::min/max, std::stringstream, std::vector) - confirm against the
+// original commit.
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+#include <sstream>
+#include <vector>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+const size_t blockX = 256;
+const size_t blockY = 256;
+const size_t blockZ = 512;
+
+const size_t chunk = 16;
+// Bytes in the whole host-visible staging buffer.
+const size_t size_S = blockX * blockY * blockZ * sizeof(cl_float4);
+// Bytes moved by one write/read transfer.
+const size_t size_s = blockX * blockY * chunk * sizeof(cl_float4);
+// Width, in characters, of one timeline row printed by ProfileQueue.
+static const int WindowWidth = 80;
+
+const size_t MaxQueues = 3;
+// Set by open() for every sub-test; selects profiling queues and the
+// timeline dump in run().
+bool profEnable = false;
+
+static const char* strKernel =
+    "__kernel void dummy(__global float4* out) \n"
+    "{ \n"
+    " uint id = get_global_id(0); \n"
+    " float4 value = (float4)(1.0f, 2.0f, 3.0f, 4.0f); \n"
+    " uint factorial = 1; \n"
+    " for (uint i = 1; i < (id / 0x400); ++i)\n"
+    " { \n"
+    " factorial *= i; \n"
+    " } \n"
+    " out[id] = value * factorial; \n"
+    "} \n";
+
+// Events recorded on one command queue, grouped by operation. The object
+// owns the events handed to addEvent() and releases them on destruction.
+class ProfileQueue {
+ public:
+  enum Operation { Write = 0, Execute, Read, Total };
+
+  static const char* OperationName[Total];
+  static const char StartCommand[Total];
+  static const char ExecCommand[Total];
+
+  ProfileQueue() {}
+  ~ProfileQueue() {
+    for (size_t op = 0; op < Total; ++op) {
+      for (size_t idx = 0; idx < events_[op].size(); ++idx) {
+        clReleaseEvent(events_[op][idx]);
+      }
+    }
+  }
+
+  void addEvent(Operation op, cl_event event) { events_[op].push_back(event); }
+
+  // Widens [*min, *max] to cover the start of the first and the end of the
+  // last event of each operation. A value of 0 means "not set yet".
+  void findMinMax(cl_long* min, cl_long* max) {
+    // Find time min/max ranges for the frame scaling
+    for (size_t op = 0; (op < ProfileQueue::Total); ++op) {
+      cl_long time;
+      if (events_[op].size() == 0) continue;
+      clGetEventProfilingInfo(events_[op][0], CL_PROFILING_COMMAND_START,
+                              sizeof(cl_long), &time, NULL);
+      if (0 == *min) {
+        *min = time;
+      } else {
+        *min = std::min(*min, time);
+      }
+      clGetEventProfilingInfo(events_[op][events_[op].size() - 1],
+                              CL_PROFILING_COMMAND_END, sizeof(cl_long), &time,
+                              NULL);
+      if (0 == *max) {
+        *max = time;
+      } else {
+        *max = std::max(*max, time);
+      }
+    }
+  }
+
+  // Prints one WindowWidth-character row per operation covering the time
+  // frame [start, finish): '-' is idle, StartCommand marks the first column
+  // of an event, ExecCommand the rest, and '+' an event shorter than one
+  // column.
+  void display(cl_long start, cl_long finish) {
+    std::string graph;
+    graph.resize(WindowWidth + 1);
+    graph[WindowWidth] = '\x0';
+    cl_long timeFrame = finish - start;
+    cl_long interval = timeFrame / WindowWidth;
+    // A frame shorter than WindowWidth ticks truncates to 0 and would make
+    // the scaling below divide by zero.
+    if (interval <= 0) {
+      interval = 1;
+    }
+
+    // Find time min/max ranges for the frame scaling
+    for (size_t op = 0; (op < Total); ++op) {
+      if (events_[op].size() == 0) continue;
+      cl_long timeStart, timeEnd;
+      int begin = 0, end = 0;
+      for (size_t idx = 0; idx < events_[op].size(); ++idx) {
+        bool cutStart = false;
+        clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_START,
+                                sizeof(cl_long), &timeStart, NULL);
+        clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_END,
+                                sizeof(cl_long), &timeEnd, NULL);
+
+        // Continue if out of the frame scope
+        if (timeStart >= finish) continue;
+        if (timeEnd <= start) continue;
+
+        if (timeStart <= start) {
+          timeStart = start;
+          cutStart = true;
+        }
+
+        if (timeEnd >= finish) {
+          timeEnd = finish;
+        }
+
+        // Readjust time to the frame
+        timeStart -= start;
+        timeEnd -= start;
+        timeStart = static_cast<cl_long>(
+            floor(static_cast<float>(timeStart) / interval + 0.5f));
+        timeEnd = static_cast<cl_long>(
+            floor(static_cast<float>(timeEnd) / interval + 0.5f));
+        // interval is rounded down, so the scaled columns can land past the
+        // last one; clamp them so graph is never indexed out of range.
+        timeStart = std::min<cl_long>(timeStart, WindowWidth);
+        timeEnd = std::min<cl_long>(timeEnd, WindowWidth);
+        begin = static_cast<int>(timeStart);
+        // Idle from end to begin
+        for (int c = end; c < begin; ++c) {
+          graph[c] = '-';
+        }
+        end = static_cast<int>(timeEnd);
+        for (int c = begin; c < end; ++c) {
+          if ((c == begin) && !cutStart) {
+            graph[c] = StartCommand[op];
+          } else {
+            graph[c] = ExecCommand[op];
+          }
+        }
+        if ((begin == end) && (end < WindowWidth)) {
+          graph[begin] = '+';
+        }
+      }
+      if (end < WindowWidth) {
+        for (int c = end; c < WindowWidth; ++c) {
+          graph[c] = '-';
+        }
+      }
+      printf("%s\n", graph.c_str());
+    }
+  }
+
+ private:
+  // Profiling events
+  std::vector<cl_event> events_[Total];
+};
+
+const char* ProfileQueue::OperationName[Total] = {
+    "BufferWrite", "KernelExecution", "BufferRead"};
+const char ProfileQueue::StartCommand[Total] = {'W', 'X', 'R'};
+const char ProfileQueue::ExecCommand[Total] = {'>', '#', '<'};
+
+// Per-queue event recorder. When profiling is disabled addEvent() ignores
+// the event (the caller keeps ownership) and display() prints nothing.
+class Profile {
+ public:
+  Profile(bool profEna, int numQueues)
+      : profileEna_(profEna),
+        numQueues_(numQueues),
+        min_(0),
+        max_(0),
+        execTime_(0) {}
+
+  ~Profile() {}
+
+  void addEvent(int queue, ProfileQueue::Operation op, cl_event event) {
+    if (profileEna_) {
+      profQueue[queue].addEvent(op, event);
+    }
+  }
+
+  // Returns max_ - min_ over all queues; the result is cached once non-zero.
+  cl_long findExecTime() {
+    if (execTime_ != 0) return execTime_;
+    for (int q = 0; q < numQueues_; ++q) {
+      profQueue[q].findMinMax(&min_, &max_);
+    }
+    execTime_ = max_ - min_;
+    return execTime_;
+  }
+
+  // start/finish are offsets from the earliest recorded timestamp (min_).
+  void display(cl_long start, cl_long finish) {
+    if (!profileEna_) return;
+    printf("\n ----------- Time frame %.3f (us), scale 1:%.0f\n",
+           (float)(finish - start) / 1000,
+           (float)(finish - start) / (1000 * WindowWidth));
+    for (size_t op = 0; (op < ProfileQueue::Total); ++op) {
+      printf("%s - %c%c; ", ProfileQueue::OperationName[op],
+             ProfileQueue::StartCommand[op], ProfileQueue::ExecCommand[op]);
+    }
+    printf("\n");
+    for (int q = 0; q < numQueues_; ++q) {
+      printf("CommandQueue #%d\n", q);
+      profQueue[q].display(min_ + start, min_ + finish);
+    }
+  }
+
+ private:
+  bool profileEna_;
+  int numQueues_;     //!< Total number of queues
+  cl_long min_;       //!< Min HW timestamp
+  cl_long max_;       //!< Max HW timestamp
+  cl_long execTime_;  //!< Profile time
+  ProfileQueue profQueue[MaxQueues];
+};
+
+// Sub-tests: {1..MaxQueues queues} x {no kernel, kernel} x {profiling off, on}.
+OCLPerfDoubleDMA::OCLPerfDoubleDMA() {
+  _numSubTests = 2 * MaxQueues * 2;
+  failed_ = false;
+}
+
+OCLPerfDoubleDMA::~OCLPerfDoubleDMA() {}
+
+// Builds the kernel and allocates the buffers for one sub-test:
+// (test % MaxQueues) + 1 device buffers of size_s bytes, followed by one
+// CL_MEM_ALLOC_HOST_PTR buffer of size_S bytes. Sub-tests numbered
+// 2 * MaxQueues and above enable profiling. A non-GPU device sets failed_
+// and returns.
+void OCLPerfDoubleDMA::open(unsigned int test, char* units, double& conversion,
+                            unsigned int deviceId) {
+  _deviceId = deviceId;
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  test_ = test;
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    // Keep the buffer printable even if the log query itself fails.
+    programLog[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  size_t bufSize = size_s;
+  cl_mem buffer;
+  // Assigned in both directions: profEnable is a file-level global, so a
+  // value left over from an earlier sub-test must not leak into this one.
+  profEnable = (test_ >= (2 * MaxQueues));
+  test_ %= 2 * MaxQueues;
+  size_t numBufs = (test_ % MaxQueues) + 1;
+  for (size_t b = 0; b < numBufs; ++b) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize,
+                                      NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+
+  buffer = _wrapper->clCreateBuffer(context_,
+                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                                    size_S, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+// Streams the size_S-byte staging buffer through the device buffers in
+// size_s-byte pieces (write, optional kernel, read back), rotating over the
+// command queues and chaining the steps with events. Reports 2 * size_S
+// bytes over the elapsed time, in GB/s, in _perfInfo.
+void OCLPerfDoubleDMA::run(void) {
+  if (failed_) {
+    return;
+  }
+  CPerfCounter timer;
+  const int numQueues = (test_ % MaxQueues) + 1;
+  const bool useKernel = ((test_ / MaxQueues) > 0);
+  const int numBufs = numQueues;
+  Profile profile(profEnable, numQueues);
+
+  std::vector<cl_command_queue> cmdQueues(numQueues);
+  int q;
+  cl_command_queue_properties qProp =
+      (profEnable) ? CL_QUEUE_PROFILING_ENABLE : 0;
+  for (q = 0; q < numQueues; ++q) {
+    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
+        context_, devices_[_deviceId], qProp, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
+    cmdQueues[q] = cmdQueue;
+  }
+
+  float* Data_s = (float*)_wrapper->clEnqueueMapBuffer(
+      cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      size_S, 0, NULL, NULL, &error_);
+  // Every transfer below reads or writes through Data_s, so a failed map
+  // must stop the test. Release the queues first because CHECK_RESULT is
+  // assumed to return from the function.
+  const bool mapFailed = (error_ != CL_SUCCESS) || (Data_s == NULL);
+  if (mapFailed) {
+    for (q = 0; q < numQueues; ++q) {
+      _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    }
+  }
+  CHECK_RESULT(mapFailed, "clEnqueueMapBuffer() failed");
+
+  size_t gws[1] = {size_s / (4 * sizeof(float))};
+  size_t lws[1] = {256};
+
+  // Warm-up
+  for (q = 0; q < numQueues; ++q) {
+    error_ |=
+        _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                       size_s, (char*)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                       (void*)&buffers_[q]);
+    error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                               gws, lws, 0, NULL, NULL);
+    error_ |=
+        _wrapper->clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
+                                      size_s, (char*)Data_s, 0, NULL, NULL);
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  size_t s_done = 0;
+  // Latest read / write / execute event per device buffer (0: none yet).
+  cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};
+
+  /*---------- pass2: copy Data_s to and from GPU Buffers ----------*/
+  s_done = 0;
+  timer.Reset();
+  timer.Start();
+  int idx = numBufs - 1;
+  // Start from the last so read/write won't go to the same DMA when kernel is
+  // executed
+  q = numQueues - 1;
+  size_t iter = 0;
+  while (1) {
+    if (0 == r[idx]) {
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char*)Data_s + s_done, 0, NULL, &w[idx]);
+    } else {
+      // The buffer is being reused: wait for its previous read-back first.
+      error_ |= _wrapper->clEnqueueWriteBuffer(
+          cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+          (char*)Data_s + s_done, 1, &r[idx], &w[idx]);
+      // With profiling on, the Profile object owns the event and releases it.
+      if (!profEnable) {
+        error_ |= _wrapper->clReleaseEvent(r[idx]);
+      }
+    }
+    _wrapper->clFlush(cmdQueues[q]);
+    profile.addEvent(q, ProfileQueue::Write, w[idx]);
+
+    if (useKernel) {
+      // Change the queue
+      q = (q + 1) % numQueues;
+      // Implicit flush of DMA engine on kernel start, because memory dependency
+      error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                         (void*)&buffers_[idx]);
+      error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
+                                                 gws, lws, 1, &w[idx], &x[idx]);
+      if (!profEnable) {
+        error_ |= _wrapper->clReleaseEvent(w[idx]);
+      }
+      profile.addEvent(q, ProfileQueue::Execute, x[idx]);
+    }
+    _wrapper->clFlush(cmdQueues[q]);
+
+    // Change the queue
+    q = (q + 1) % numQueues;
+    error_ |= _wrapper->clEnqueueReadBuffer(
+        cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
+        (char*)Data_s + s_done, 1, (useKernel) ? &x[idx] : &w[idx], &r[idx]);
+    if (!profEnable) {
+      error_ |= _wrapper->clReleaseEvent((useKernel) ? x[idx] : w[idx]);
+    }
+    profile.addEvent(q, ProfileQueue::Read, r[idx]);
+    _wrapper->clFlush(cmdQueues[q]);
+
+    if ((s_done += size_s) >= size_S) {
+      break;
+    }
+    ++iter;
+    idx = (idx + 1) % numBufs;
+    q = (q + 1) % numQueues;
+  }
+
+  // Without profiling nobody else owns the read events. Release the last one
+  // of every buffer, not only the buffer used by the final iteration.
+  if (!profEnable) {
+    for (int b = 0; b < numBufs; ++b) {
+      if (0 != r[b]) {
+        error_ |= _wrapper->clReleaseEvent(r[b]);
+      }
+    }
+  }
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ |= _wrapper->clFinish(cmdQueues[q]);
+  }
+  timer.Stop();
+
+  // Accumulate (|=) so a failure inside the timed loop is not overwritten.
+  error_ |= _wrapper->clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs],
+                                              Data_s, 0, NULL, NULL);
+
+  error_ |= _wrapper->clFinish(cmdQueues[0]);
+  CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
+
+  // The frame arithmetic divides by iter and subtracts 2 from iter / 2, so it
+  // is only meaningful (and only safe on an unsigned type) for iter >= 4.
+  if (profEnable && (iter >= 4)) {
+    cl_long gpuTimeFrame = profile.findExecTime();
+    cl_long oneIter = gpuTimeFrame / iter;
+
+    // Display 4 iterations in the middle
+    cl_long startFrame = oneIter * (iter / 2 - 2);
+    cl_long finishFrame = oneIter * (iter / 2 + 2);
+    profile.display(startFrame, finishFrame);
+  }
+
+  for (q = 0; q < numQueues; ++q) {
+    error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
+    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
+                           "clReleaseCommandQueue() failed");
+  }
+
+  double GBytes = (double)(2 * size_S) / (double)(1000 * 1000 * 1000);
+  _perfInfo = static_cast<float>(GBytes / timer.GetElapsedTime());
+
+  std::stringstream stream;
+  if (useKernel) {
+    stream << "Write/Kernel/Read operation ";
+  } else {
+    stream << "Write/Read operation ";
+  }
+  stream << numQueues << " queues; profiling "
+         << ((profEnable) ? "enabled" : "disabled") << " [GB/s]";
+
+  stream.flags(std::ios::right | std::ios::showbase);
+  testDescString = stream.str();
+}
+
+unsigned int OCLPerfDoubleDMA::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h
new file mode 100644
index 0000000000..5eb0d6d060
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMA.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/
+
+#ifndef _OCL_PERF_DOUBLE_DMA_H_
+#define _OCL_PERF_DOUBLE_DMA_H_
+
+#include "OCLTestImp.h"
+
+// Write / (kernel) / read streaming test over one or more command queues.
+// It overrides the three OCLTestImp entry points and adds no public members.
+class OCLPerfDoubleDMA : public OCLTestImp {
+ public:
+  OCLPerfDoubleDMA();
+  virtual ~OCLPerfDoubleDMA();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Cleared by the constructor; open() sets it when the device is not a GPU
+  // and run() returns immediately while it is set.
+  bool failed_;
+  // Sub-test index stored by open(); run() derives the queue count and the
+  // kernel on/off choice from it.
+  unsigned int test_;
+};
+
+#endif  // _OCL_PERF_DOUBLE_DMA_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp
new file mode 100644
index 0000000000..049253d35c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.cpp
@@ -0,0 +1,291 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLPerfDoubleDMASeq.h" + +#include +#include +#include + +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +#ifdef _WIN32 +const size_t blockX = 128; +const size_t blockY = 128; +const size_t blockZ = 256; +#else +const size_t blockX = 256; +const size_t blockY = 256; +const size_t blockZ = 512; +#endif + +const size_t chunk = 16; +const size_t size_S = blockX * blockY * blockZ * sizeof(cl_float4); +const size_t size_s = blockX * blockY * chunk * sizeof(cl_float4); +static const int WindowWidth = 80; + +const size_t MaxQueues = 3; + +static const char *strKernel = + "__kernel void dummy(__global float4* out) \n" + "{ \n" + " uint id = get_global_id(0); \n" + " float4 value = (float4)(1.0f, 2.0f, 3.0f, 4.0f); \n" + " uint factorial = 1; \n" + " for (uint i = 1; i < (id / 0x400); ++i)\n" + " { \n" + " factorial *= i; \n" + " } \n" + " out[id] = value * factorial; \n" + "} \n"; + +OCLPerfDoubleDMASeq::OCLPerfDoubleDMASeq() { + _numSubTests = MaxQueues * 2; + failed_ = false; +} + +OCLPerfDoubleDMASeq::~OCLPerfDoubleDMASeq() {} + +void OCLPerfDoubleDMASeq::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + _deviceId = deviceId; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + test_ = test; + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char 
programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + size_t bufSize = size_s; + cl_mem buffer; + test_ %= MaxQueues; + events_ = ((test / MaxQueues) == 0) ? false : true; + size_t numBufs = (test_ % MaxQueues) + 1; + for (size_t b = 0; b < numBufs; ++b) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } + + buffer = _wrapper->clCreateBuffer(context_, + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + size_S, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfDoubleDMASeq::run(void) { + if (failed_) { + return; + } + CPerfCounter timer; + const int numQueues = (test_ % MaxQueues) + 1; + const int numBufs = numQueues; + + std::vector cmdQueues(numQueues); + int q; + cl_command_queue_properties qProp = 0; + for (q = 0; q < numQueues; ++q) { + cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue( + context_, devices_[_deviceId], qProp, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed"); + cmdQueues[q] = cmdQueue; + } + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed"); + + float *Data_s = (float *)_wrapper->clEnqueueMapBuffer( + cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, + size_S, 0, NULL, NULL, &error_); + + size_t gws[1] = {size_s / (4 * sizeof(float))}; + size_t lws[1] = {256}; + + // Warm-up + for (q = 0; q < 
numQueues; ++q) { + error_ |= + _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0, + size_s, (char *)Data_s, 0, NULL, NULL); + error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&buffers_[q]); + error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL, + gws, lws, 0, NULL, NULL); + error_ |= + _wrapper->clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0, + size_s, (char *)Data_s, 0, NULL, NULL); + error_ |= _wrapper->clFinish(cmdQueues[q]); + } + + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed"); + + size_t s_done = 0; + cl_event x[MaxQueues] = {0}; + + /*---------- pass2: copy Data_s to and from GPU Buffers ----------*/ + s_done = 0; + timer.Reset(); + timer.Start(); + int idx = numBufs - 1; + // Start from the last so read/write won't go to the same DMA when kernel is + // executed + q = numQueues - 1; + size_t iter = 0; + if (events_) { + while (1) { + error_ |= _wrapper->clEnqueueWriteBuffer( + cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, + (char *)Data_s + s_done, 0, NULL, NULL); + + // Implicit flush of DMA engine on kernel start, because memory dependency + error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&buffers_[idx]); + int prevQ; + if (q == 0) { + prevQ = numQueues - 1; + } else { + prevQ = q - 1; + } + if ((x[prevQ] != NULL) && (numQueues != 1)) { + error_ |= _wrapper->clEnqueueNDRangeKernel( + cmdQueues[q], kernel_, 1, NULL, gws, lws, 1, &x[prevQ], &x[q]); + error_ |= _wrapper->clReleaseEvent(x[prevQ]); + x[prevQ] = NULL; + } else { + error_ |= _wrapper->clEnqueueNDRangeKernel( + cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, &x[q]); + if (numQueues == 1) { + error_ |= _wrapper->clReleaseEvent(x[q]); + x[q] = NULL; + } + } + error_ |= _wrapper->clFlush(cmdQueues[q]); + + // Change the queue + error_ |= _wrapper->clEnqueueReadBuffer( + cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, + (char *)Data_s + s_done, 0, NULL, NULL); + + if 
((s_done += size_s) >= size_S) { + break; + } + + error_ |= _wrapper->clFlush(cmdQueues[q]); + ++iter; + ++idx %= numBufs; + ++q %= numQueues; + } + for (q = 0; q < numQueues; ++q) { + if (x[q] != NULL) { + error_ |= _wrapper->clReleaseEvent(x[q]); + } + } + } else { + while (1) { + error_ |= _wrapper->clEnqueueWriteBuffer( + cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, + (char *)Data_s + s_done, 0, NULL, NULL); + + // Implicit flush of DMA engine on kernel start, because memory dependency + error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&buffers_[idx]); + error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL, + gws, lws, 0, NULL, NULL); + + // Change the queue + error_ |= _wrapper->clEnqueueReadBuffer( + cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, + (char *)Data_s + s_done, 0, NULL, NULL); + + if ((s_done += size_s) >= size_S) { + break; + } + + error_ |= _wrapper->clFlush(cmdQueues[q]); + ++iter; + ++idx %= numBufs; + ++q %= numQueues; + } + } + + for (q = 0; q < numQueues; ++q) { + error_ |= _wrapper->clFinish(cmdQueues[q]); + } + timer.Stop(); + + error_ |= _wrapper->clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs], + Data_s, 0, NULL, NULL); + + error_ |= _wrapper->clFinish(cmdQueues[0]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed"); + + for (q = 0; q < numQueues; ++q) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + + double GBytes = (double)(2 * size_S) / (double)(1000 * 1000 * 1000); + _perfInfo = static_cast(GBytes / timer.GetElapsedTime()); + + std::stringstream stream; + stream << "Write/Kernel/Read operation "; + + stream << numQueues << " queues "; + if (events_) { + stream << " (use events) "; + } + stream << " [GB/s]"; + + stream.flags(std::ios::right | std::ios::showbase); + testDescString = stream.str(); +} + +unsigned int OCLPerfDoubleDMASeq::close(void) { 
return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h new file mode 100644 index 0000000000..7569233798 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfDoubleDMASeq.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_PERF_DOUBLE_DMA_SEQ_H_
+#define _OCL_PERF_DOUBLE_DMA_SEQ_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that pushes a large host-mapped buffer through device
+// buffers as a write -> kernel -> read sequence spread over one or more
+// command queues (implementation in OCLPerfDoubleDMASeq.cpp).
+class OCLPerfDoubleDMASeq : public OCLTestImp {
+ public:
+  OCLPerfDoubleDMASeq();
+  virtual ~OCLPerfDoubleDMASeq();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Set by open() when the device is not a GPU; run() then returns at once.
+  bool failed_;
+  // Sub-test index, reduced modulo the queue limit in open(); run() derives
+  // the number of queues from it.
+  unsigned int test_;
+  // True for the second half of the sub-test range: run() then chains the
+  // kernels of neighbouring queues with events.
+  bool events_;
+};
+
+#endif  // _OCL_PERF_DOUBLE_DMA_SEQ_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp
new file mode 100644
index 0000000000..e090a768ad
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.cpp
@@ -0,0 +1,114 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLPerfFillBuffer.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +static size_t typeSizeList[] = { + 1, // sizeof(cl_uchar) + 2, 4, 8, 16, 32, 64, + 128, // sizeof(cl_ulong16) +}; + +static unsigned int eleNumList[] = { + 0x0020000, 0x0080000, 0x0200000, 0x0800000, 0x2000000, +}; + +OCLPerfFillBuffer::OCLPerfFillBuffer() { + num_typeSize_ = sizeof(typeSizeList) / sizeof(size_t); + num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); + _numSubTests = num_elements_ * num_typeSize_; + failed_ = false; + skip_ = false; +} + +OCLPerfFillBuffer::~OCLPerfFillBuffer() {} + +void OCLPerfFillBuffer::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + testTypeSize_ = typeSizeList[(test / num_elements_) % num_typeSize_]; + testNumEle_ = eleNumList[test % num_elements_]; + + bufSize_ = testNumEle_ * 4; + + buffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize_, 0, + &error_); + CHECK_RESULT(buffer_ == 0, "clCreateBuffer(buffer_) failed"); + + return; +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfFillBuffer::run(void) { + CPerfCounter timer; + size_t iter = 100; + + void *data = malloc(testTypeSize_); + + timer.Reset(); + timer.Start(); + for (size_t i = 0; i < iter; ++i) { + error_ = clEnqueueFillBuffer(cmdQueues_[_deviceId], buffer_, data, + testTypeSize_, 0, bufSize_, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillBuffer() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + + char buf[256]; + + SNPRINTF(buf, sizeof(buf), "FillBuffer (GB/s) for %6d KB, typeSize:%3d", + 
(int)bufSize_ / 1024, (int)testTypeSize_); + + testDescString = buf; + double sec = timer.GetElapsedTime(); + _perfInfo = static_cast((bufSize_ * iter * (double)(1e-09)) / sec); +} + +unsigned int OCLPerfFillBuffer::close(void) { + if (buffer_) { + error_ = _wrapper->clReleaseMemObject(buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(buffer) failed"); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h new file mode 100644 index 0000000000..afd6d0caea --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillBuffer.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PERF_FILL_BUFFER_H_ +#define _OCL_PERF_FILL_BUFFER_H_ + +#include "OCLTestImp.h" + +class OCLPerfFillBuffer : public OCLTestImp { + public: + OCLPerfFillBuffer(); + virtual ~OCLPerfFillBuffer(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + cl_mem buffer_; + unsigned int bufSize_; + unsigned int num_typeSize_; + unsigned int num_elements_; + size_t testTypeSize_; + unsigned int testNumEle_; + bool failed_; + bool skip_; +}; + +#endif // _OCL_PERF_FILL_BUFFER_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp new file mode 100644 index 0000000000..7de92cc1a9 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.cpp @@ -0,0 +1,109 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfFillImage.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +static unsigned int sizeList[] = { + 256, 512, 1024, 2048, 4096, 8192, +}; + +OCLPerfFillImage::OCLPerfFillImage() { + num_sizes_ = sizeof(sizeList) / sizeof(unsigned int); + _numSubTests = num_sizes_; + failed_ = false; + skip_ = false; +} + +OCLPerfFillImage::~OCLPerfFillImage() {} + +void OCLPerfFillImage::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + bufSize_ = sizeList[test % num_sizes_]; + + cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8}; + buffer_ = _wrapper->clCreateImage2D(context_, CL_MEM_WRITE_ONLY, &format, + bufSize_, bufSize_, 0, NULL, &error_); + CHECK_RESULT(buffer_ == 0, "clCreateImage2D(imageBuffer_) failed"); + + return; +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfFillImage::run(void) { + CPerfCounter timer; + size_t iter = 100; + + cl_uint4 fillColor = {1, 1, 1, 1}; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, 1}; + + timer.Reset(); + timer.Start(); + for (size_t i = 0; i < iter; ++i) { + error_ = clEnqueueFillImage(cmdQueues_[_deviceId], buffer_, + (const void *)&fillColor, origin, region, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + + char 
buf[256]; + + SNPRINTF(buf, sizeof(buf), "FillImage (GB/s) for %4dx%4d ", (int)bufSize_, + (int)bufSize_); + + testDescString = buf; + double sec = timer.GetElapsedTime(); + _perfInfo = static_cast( + (bufSize_ * bufSize_ * 4 * iter * (double)(1e-09)) / sec); +} + +unsigned int OCLPerfFillImage::close(void) { + if (buffer_) { + error_ = _wrapper->clReleaseMemObject(buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(buffer) failed"); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h new file mode 100644 index 0000000000..5313e7941c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFillImage.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_PERF_FILL_IMAGE_H_
+#define _OCL_PERF_FILL_IMAGE_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times clEnqueueFillImage on square 2D images of
+// several sizes (implementation in OCLPerfFillImage.cpp).
+class OCLPerfFillImage : public OCLTestImp {
+ public:
+  OCLPerfFillImage();
+  virtual ~OCLPerfFillImage();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // 2D image created in open() and released in close().
+  cl_mem buffer_;
+  // Despite the name, the edge length of the image in pixels; open() passes
+  // it as both width and height.
+  unsigned int bufSize_;
+  // Number of entries in the size table, which is also the sub-test count.
+  unsigned int num_sizes_;
+  // Both are cleared by the constructor; the visible implementation does not
+  // read them.
+  bool failed_;
+  bool skip_;
+};
+
+#endif  // _OCL_PERF_FILL_IMAGE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp
new file mode 100644
index 0000000000..c38f73e01d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.cpp
@@ -0,0 +1,165 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLPerfFlush.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" + +static const cl_uint Iterations = 0x10000; +static const cl_uint IterationDivider = 2; +static const size_t MaxBuffers = IterationDivider; +static size_t BufSize = 0x1000; + +const static char* strKernel = + "__kernel void factorial(__global uint* out) \n" + "{ \n" + " uint id = get_global_id(0); \n" + " uint factorial = 1; \n" + " for (uint i = 1; i < (id / 0x10000); ++i) \n" + " { \n" + " factorial *= i; \n" + " } \n" + " out[id] = factorial; \n" + "} \n"; + +unsigned int NumTests = 3; + +OCLPerfFlush::OCLPerfFlush() { + _numSubTests = NumTests; + failed_ = false; +} + +OCLPerfFlush::~OCLPerfFlush() {} + +void OCLPerfFlush::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + test_ = test; + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + size_t maxWorkGroupSize = 1; + cl_uint computePower = 1; + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); + computePower *= static_cast(maxWorkGroupSize); + cl_uint maxComputeUnits = 1; + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), + &maxComputeUnits, NULL); + computePower *= 32 * maxComputeUnits; + BufSize = (BufSize < static_cast(computePower)) + ? 
static_cast(computePower) + : BufSize; + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + for (size_t i = 0; i < MaxBuffers; ++i) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + BufSize * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } +} + +void OCLPerfFlush::run(void) { + if (failed_) { + return; + } + for (size_t y = 0; y < IterationDivider; ++y) { + cl_mem buffer = buffers()[y]; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {BufSize}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + + CPerfCounter timer; + const char* descriptions[] = { + "Single batch: ", "clFlush(): ", "clFinish(): "}; + + timer.Reset(); + timer.Start(); + cl_uint x; + for (x = 0; x < Iterations / IterationDivider; x++) { + for (size_t y = 0; y < IterationDivider; ++y) { + cl_mem buffer = buffers()[y]; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + 
+ size_t gws[1] = {BufSize}; + error_ = _wrapper->clEnqueueNDRangeKernel( + cmdQueues_[_deviceId], kernel_, 1, NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + if (test_ == 1) { + _wrapper->clFlush(cmdQueues_[_deviceId]); + } else if (test_ == 2) { + _wrapper->clFinish(cmdQueues_[_deviceId]); + } + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + + std::stringstream stream; + stream << "Loop[" << std::hex << Iterations << "], " << descriptions[test_]; + stream << "(sec)"; + testDescString = stream.str(); + _perfInfo = static_cast(timer.GetElapsedTime()); +} + +unsigned int OCLPerfFlush::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h new file mode 100644 index 0000000000..06c71c7354 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfFlush.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_PERF_FLUSH_H_ +#define _OCL_PERF_FLUSH_H_ + +#include "OCLTestImp.h" + +class OCLPerfFlush : public OCLTestImp { + public: + OCLPerfFlush(); + virtual ~OCLPerfFlush(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int test_; +}; + +#endif // _OCL_PERF_FLUSH_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp new file mode 100644 index 0000000000..2cc45d7e61 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.cpp @@ -0,0 +1,309 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfGenericBandwidth.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; + +void OCLPerfGenericBandwidth::genShader(unsigned int idx) { + shader_.clear(); + if (idx == 0) { + shader_ += + "__kernel __attribute__((reqd_work_group_size(64,1,1))) void " + "_genericReadSpeed(global float *outBuf, global float *inBuf, local " + "float *inLocal, float c, char useLocal)\n" + "{\n" + " int gid = (int) get_global_id(0);\n" + " int lid = (int) get_local_id(0);\n" + " float val0 = 0.0f;\n" + " float val1 = 0.0f;\n" + " float *localLocal;\n" + " int hacklid = gid % 64;\n" + " if (useLocal)\n" + " localLocal = inLocal;\n" + " else\n" + " localLocal = inBuf;\n" + " for (int i = 0; i < (768/64); i++) {\n" + " localLocal[hacklid + i*64] = lid;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "#pragma nounroll\n" + " for (uint i = 0; i < 32;i++)\n" + " {\n" + " val0 += localLocal[lid+0];\n" + " val1 += localLocal[lid+64];\n" + " val0 += localLocal[lid+128];\n" + " val1 += localLocal[lid+192];\n" + " val0 += localLocal[lid+256];\n" + " val1 += localLocal[lid+320];\n" + " val0 += localLocal[lid+384];\n" + " val1 += localLocal[lid+448];\n" + " lid += 1;\n" + " }\n" + " val0 += val1;\n" + " val1 = min(val0,1.0f);\n" + " if ((lid + val1) < 0){\n" + " outBuf[gid] = val0;\n" + " }\n" + "}\n"; + dataSizeBytes_ = 768 * 4; + } else { + shader_ += + "__kernel __attribute__((reqd_work_group_size(64,1,1))) void " + "_genericReadSpeed(global 
float *outBuf, global float *inBuf, local " + "float *inLocal, float c, char useLocal)\n" + "{\n" + " uint gid = (uint) get_global_id(0);\n" + " int lid = (int) get_local_id(0);\n" + " float val0 = 0.0f;\n" + " float val1 = 0.0f;\n" + " float *localLocal;\n" + " uint hacklid = gid % 64;\n" + " if (useLocal)\n" + " localLocal = inLocal;\n" + " else\n" + " localLocal = inBuf;\n" + " for (int i = 0; i < (256/64); i++) {\n" + " localLocal[hacklid + i*64] = lid;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " #pragma nounroll\n" + " for (uint i = 0; i < 32;i++)\n" + " {\n" + " val0 += localLocal[8*i+0];\n" + " val1 += localLocal[8*i+1];\n" + " val0 += localLocal[8*i+2];\n" + " val1 += localLocal[8*i+3];\n" + " val0 += localLocal[8*i+4];\n" + " val1 += localLocal[8*i+5];\n" + " val0 += localLocal[8*i+6];\n" + " val1 += localLocal[8*i+7];\n" + " }\n" + " val0 += val1;\n" + " val1 = min(val0,1.0f);\n" + " if ((lid + val1) < 0){\n" + " outBuf[gid] = val0;\n" + " }\n" + "}\n"; + dataSizeBytes_ = 256 * 4; + } +} + +OCLPerfGenericBandwidth::OCLPerfGenericBandwidth() { + _numSubTests = NUM_SIZES * 4; +} + +OCLPerfGenericBandwidth::~OCLPerfGenericBandwidth() {} + +void OCLPerfGenericBandwidth::setData(cl_mem buffer, float val) { + float *data = (float *)_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, + NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffer, + data, 0, NULL, NULL); + _wrapper->clFinish(cmdQueues_[_deviceId]); +} + +void OCLPerfGenericBandwidth::checkData(cl_mem buffer) { + float *data = (float *)_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, + NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) { + if (data[i] != (float)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d 
%d %d\n", numReads_, numReads_, + numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffer, + data, 0, NULL, NULL); + _wrapper->clFinish(cmdQueues_[_deviceId]); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfGenericBandwidth::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + _crcword = 0; + conversion = 1.0f; + + failed = false; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + useLDS_ = ((test / NUM_SIZES) % 2) == 0 ? 1 : 0; + + size_t param_size = 0; + char *strVersion = 0; + error_ = _wrapper->clGetDeviceInfo( + devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = + _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[9] < '2') { + failed = true; + return; + } + delete strVersion; + + numReads_ = 32; + width_ = Sizes[test % NUM_SIZES]; + shaderIdx_ = test / (NUM_SIZES * 2); + + bufSize_ = width_; + + inBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + genShader(shaderIdx_); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, 
&error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "_genericReadSpeed", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + float foo = 0; + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&inBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 2, 1024 * sizeof(cl_float), + (void *)NULL); + error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_float), (void *)&foo); + error_ = + _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_char), (void *)&useLDS_); + + setData(outBuffer_, 1.2345678f); +} + +void OCLPerfGenericBandwidth::run(void) { + if (failed) return; + int global = bufSize_ / sizeof(cl_float); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmdQueues_[_deviceId], kernel_, 1, NULL, + (const size_t *)global_work_size, (const size_t *)local_work_size, 0, + NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + char buf[256]; + const char *buf2; + if (useLDS_) + buf2 = "LDS"; + else + buf2 = "global"; + const char *buf3; + if (shaderIdx_ == 0) { + buf3 = "reads"; + numReads_ *= 8; + } else { + buf3 = "broadcast"; + numReads_ *= 
8; + } + // LDS bandwidth in GB/s + // We have one extra write per LDS location to initialize LDS + double perf = + ((double)global * (numReads_ * sizeof(cl_float) + dataSizeBytes_ / 64) * + NUM_ITER * (double)(1e-09)) / + sec; + + _perfInfo = (float)perf; + SNPRINTF(buf, sizeof(buf), " %6s %9s %8d threads, %3d reads (GB/s) ", buf2, + buf3, global, numReads_); + testDescString = buf; + // checkData(outBuffer_); +} + +unsigned int OCLPerfGenericBandwidth::close(void) { + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h new file mode 100644 index 0000000000..6898fc0f88 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenericBandwidth.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
// Bandwidth benchmark that reads through a generic (unqualified) pointer
// which the kernel points at either local or global memory.
// Sub-tests cover NUM_SIZES buffer sizes x {LDS, global} x two kernel
// variants (see genShader() in the implementation file).
class OCLPerfGenericBandwidth : public OCLTestImp {
 public:
  OCLPerfGenericBandwidth();
  virtual ~OCLPerfGenericBandwidth();

 public:
  // Decodes the sub-test index, creates the buffers, builds the kernel and
  // sets its arguments.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Enqueues the kernel NUM_ITER times and reports the bandwidth in GB/s.
  virtual void run(void);
  // Releases the buffers and delegates to OCLTestImp::close().
  virtual unsigned int close(void);

  // OpenCL C source of the kernel, rebuilt by genShader().
  std::string shader_;
  // Writes kernel variant `idx` into shader_ and sets dataSizeBytes_.
  void genShader(unsigned int idx);
  // Fills `buffer` (bufSize_ bytes) with the float value `data`.
  void setData(cl_mem buffer, float data);
  // Verifies that every float in `buffer` equals numReads_.
  void checkData(cl_mem buffer);

  // Number of kernel launches timed by run().
  static const unsigned int NUM_ITER = 100;

  cl_mem inBuffer_;
  cl_mem outBuffer_;

  unsigned int width_;          // buffer size in bytes selected from Sizes[]
  unsigned int bufSize_;        // size in bytes of inBuffer_ / outBuffer_
  unsigned int vecSizeIdx_;     // not referenced by the implementation file
  unsigned int numReads_;       // reads per work-item used in the GB/s formula
  unsigned int shaderIdx_;      // kernel variant passed to genShader()
  unsigned int dataSizeBytes_;  // bytes initialised per work-group by the kernel
  cl_char useLDS_;              // kernel argument: 1 = local memory, 0 = global
  bool failed;                  // set by open() when the device lacks OpenCL C 2.x
};
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
// OpenCL C source of the benchmark kernel, passed verbatim to
// clCreateProgramWithSource.
//
// Contents:
//  - ror64 / ror64_2: 64-bit rotate-right helpers operating on a uint2.
//  - blake2b_sigma: 12 x 16 message-word permutation table.
//  - nonceGrind(headerIn, nonceOut): builds the message m[] from
//    headerIn[0..9] with word 4 replaced by the work-item's global id (the
//    nonce), reads the target from headerIn[4], runs 12 ROUND() mixing
//    rounds over v[], and stores the nonce in *nonceOut when the
//    byte-reversed value of (0x6a09e667f2bdc928 ^ v[0] ^ v[8]) is below the
//    target.
//
// NOTE(review): the comment embedded in the kernel says the target comes from
// "headerIn[32 - 29]" while the code reads headerIn[4] - confirm which is
// intended.
static const char *siaKernel =
    " inline static uint2 ror64(const uint2 x, const uint y) "
    " \n"
    " { "
    " \n"
    " return "
    "(uint2)(((x).x>>y)^((x).y<<(32-y)),((x).y>>y)^((x).x<<(32-y))); "
    " \n"
    " } "
    " \n"
    " inline static uint2 ror64_2(const uint2 x, const uint y) "
    " \n"
    " { "
    " \n"
    " return "
    "(uint2)(((x).y>>(y-32))^((x).x<<(64-y)),((x).x>>(y-32))^((x).y<<(64-y))); "
    " \n"
    " } "
    " \n"
    " __constant static const uchar blake2b_sigma[12][16] = { "
    " \n"
    " { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } "
    ", \n"
    " { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } "
    ", \n"
    " { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } "
    ", \n"
    " { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } "
    ", \n"
    " { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } "
    ", \n"
    " { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } "
    ", \n"
    " { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } "
    ", \n"
    " { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } "
    ", \n"
    " { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } "
    ", \n"
    " { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } "
    ", \n"
    " { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } "
    ", \n"
    " { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } "
    "}; \n"
    " // Target is passed in via headerIn[32 - 29] "
    " \n"
    " __kernel void nonceGrind(__global ulong *headerIn, __global ulong "
    "*nonceOut) { \n"
    " ulong target = headerIn[4]; "
    " \n"
    " ulong m[16] = { headerIn[0], headerIn[1], "
    " \n"
    " headerIn[2], headerIn[3], "
    " \n"
    " (ulong)get_global_id(0), headerIn[5], "
    " \n"
    " headerIn[6], headerIn[7], "
    " \n"
    " headerIn[8], headerIn[9], 0, 0, 0, 0, 0, 0 }; "
    " \n"
    " ulong v[16] = { 0x6a09e667f2bdc928, 0xbb67ae8584caa73b, "
    "0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, \n"
    " 0x510e527fade682d1, 0x9b05688c2b3e6c1f, "
    "0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, \n"
    " 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, "
    "0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, \n"
    " 0x510e527fade68281, 0x9b05688c2b3e6c1f, "
    "0xe07c265404be4294, 0x5be0cd19137e2179 }; \n"
    " #define G(r,i,a,b,c,d) \\\n"
    " a = a + b + m[ blake2b_sigma[r][2*i] ]; \\\n"
    " ((uint2*)&d)[0] = ((uint2*)&d)[0].yx ^ ((uint2*)&a)[0].yx; \\\n"
    " c = c + d; \\\n"
    " ((uint2*)&b)[0] = ror64( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 24U); "
    "\\\n"
    " a = a + b + m[ blake2b_sigma[r][2*i+1] ]; \\\n"
    " ((uint2*)&d)[0] = ror64( ((uint2*)&d)[0] ^ ((uint2*)&a)[0], 16U); "
    "\\\n"
    " c = c + d; \\\n"
    " ((uint2*)&b)[0] = ror64_2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], "
    "63U);\n"
    " #define ROUND(r) \\\n"
    " G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \\\n"
    " G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \\\n"
    " G(r,2,v[ 2],v[ 6],v[10],v[14]); \\\n"
    " G(r,3,v[ 3],v[ 7],v[11],v[15]); \\\n"
    " G(r,4,v[ 0],v[ 5],v[10],v[15]); \\\n"
    " G(r,5,v[ 1],v[ 6],v[11],v[12]); \\\n"
    " G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \\\n"
    " G(r,7,v[ 3],v[ 4],v[ 9],v[14]); "
    " \n"
    " ROUND( 0 ); "
    " \n"
    " ROUND( 1 ); "
    " \n"
    " ROUND( 2 ); "
    " \n"
    " ROUND( 3 ); "
    " \n"
    " ROUND( 4 ); "
    " \n"
    " ROUND( 5 ); "
    " \n"
    " ROUND( 6 ); "
    " \n"
    " ROUND( 7 ); "
    " \n"
    " ROUND( 8 ); "
    " \n"
    " ROUND( 9 ); "
    " \n"
    " ROUND( 10 ); "
    " \n"
    " ROUND( 11 ); "
    " \n"
    " #undef G "
    " \n"
    " #undef ROUND "
    " \n"
    " if (as_ulong(as_uchar8(0x6a09e667f2bdc928 ^ v[0] ^ "
    "v[8]).s76543210) < target) { \n"
    " *nonceOut = m[4]; "
    " \n"
    " return; "
    " \n"
    " } "
    " \n"
    " }\n";
+OCLPerfGenoilSiaMiner::OCLPerfGenoilSiaMiner() { _numSubTests = NUM_INTENSITY; } + +OCLPerfGenoilSiaMiner::~OCLPerfGenoilSiaMiner() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfGenoilSiaMiner::setHeader(uint32_t *ptr) { + ptr[0] = 0x10; + for (unsigned int i = 1; i < 9; i++) { + ptr[i] = 0; + } + ptr[9] = 0x4a5e1e4b; + ptr[10] = 0xaab89f3a; + ptr[11] = 0x32518a88; + ptr[12] = 0xc31bc87f; + ptr[13] = 0x618f7667; + ptr[14] = 0x3e2cc77a; + ptr[15] = 0xb2127b7a; + ptr[16] = 0xfdeda33b; + ptr[17] = 0x495fab29; + ptr[18] = 0x1d00ffff; + ptr[19] = 0x7c2bac1d; +} + +void OCLPerfGenoilSiaMiner::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + // Parse args. 
+ isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + + char getVersion[128]; + error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION, + sizeof(getVersion), getVersion, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + platformVersion[0] = getVersion[7]; + platformVersion[1] = getVersion[8]; + platformVersion[2] = getVersion[9]; + platformVersion[3] = '\0'; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + // Make sure the device can handle our local item size. + size_t max_group_size = 0; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &max_group_size, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + if (local_item_size > max_group_size) { + char buf[256]; + SNPRINTF(buf, sizeof(buf), + "Selected device cannot handle work groups larger than %zu.\n", + local_item_size); + local_item_size = max_group_size; + testDescString = buf; + } + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + // Create Buffer Objects. + blockHeadermobj_ = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_ONLY, 80 * sizeof(uint8_t), NULL, &error_); + CHECK_RESULT(blockHeadermobj_ == 0, "clCreateBuffer(outBuffer) failed"); + nonceOutmobj_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + 8 * sizeof(uint8_t), NULL, &error_); + CHECK_RESULT(nonceOutmobj_ == 0, "clCreateBuffer(outBuffer) failed"); + + // Create kernel program from source file. 
+ program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&siaKernel, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &device, NULL, NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + // Create data parallel OpenCL kernel. + kernel_ = _wrapper->clCreateKernel(program_, "nonceGrind", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + // Set OpenCL kernel arguments. + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&blockHeadermobj_); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), + (void *)&nonceOutmobj_); +} + +void OCLPerfGenoilSiaMiner::run(void) { + CPerfCounter timer; + + uint8_t blockHeader[80]; + uint8_t target[32] = {255}; + uint8_t nonceOut[8] = {0}; + + setHeader((uint32_t *)blockHeader); + intensity = intensities[_openTest % NUM_INTENSITY]; + size_t global_item_size = 1ULL << intensity; + + timer.Reset(); + timer.Start(); + + // By doing a bunch of low intensity calls, we prevent freezing + // By splitting them up inside this function, we also avoid calling + // get_block_for_work too often. + for (unsigned int i = 0; i < cycles_per_iter; i++) { + // Offset global ids so that each loop call tries a different set of + // hashes. + size_t globalid_offset = i * global_item_size; + + // Copy input data to the memory buffer. 
+ error_ = + clEnqueueWriteBuffer(cmd_queue_, blockHeadermobj_, CL_TRUE, 0, + 80 * sizeof(uint8_t), blockHeader, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueWriteBuffer failed"); + + error_ = clEnqueueWriteBuffer(cmd_queue_, nonceOutmobj_, CL_TRUE, 0, + 8 * sizeof(uint8_t), nonceOut, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueWriteBuffer failed"); + + // Run the kernel. + error_ = clEnqueueNDRangeKernel(cmd_queue_, kernel_, 1, &globalid_offset, + &global_item_size, &local_item_size, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + // Copy result to host and see if a block was found. + error_ = clEnqueueReadBuffer(cmd_queue_, nonceOutmobj_, CL_TRUE, 0, + 8 * sizeof(uint8_t), nonceOut, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueReadBuffer failed"); + + // if (nonceOut[0] != 0) { + // // Copy nonce to header. + // memcpy(blockHeader + 32, nonceOut, 8); + // break; + //} + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Hash rate calculation MH/s + double hash_rate = cycles_per_iter * global_item_size / (sec * 1000000); + + _perfInfo = (float)hash_rate; + char buf[256]; + SNPRINTF(buf, sizeof(buf), + " (%4d cycles) Work_items:%10zu Intensity:%d (MH/s) ", + cycles_per_iter, global_item_size, intensity); + testDescString = buf; +} + +unsigned int OCLPerfGenoilSiaMiner::close(void) { + if (blockHeadermobj_) { + error_ = _wrapper->clReleaseMemObject(blockHeadermobj_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(blockHeadermobj_) failed"); + } + if (nonceOutmobj_) { + error_ = _wrapper->clReleaseMemObject(nonceOutmobj_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(nonceOutmobj_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + 
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h new file mode 100644 index 0000000000..0d2f77b454 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfGenoilSiaMiner.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
// Hash-rate benchmark built around the Sia "nonceGrind" mining kernel.
// Each sub-test runs the kernel cycles_per_iter times with 2^intensity
// work-items per launch and reports MH/s.
class OCLPerfGenoilSiaMiner : public OCLTestImp {
 public:
  OCLPerfGenoilSiaMiner();
  virtual ~OCLPerfGenoilSiaMiner();

 public:
  // Selects platform/device, creates context, queue and buffers, and builds
  // the kernel.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Times cycles_per_iter kernel launches and reports the hash rate.
  virtual void run(void);
  // Releases all OpenCL objects created by open() and returns _crcword.
  virtual unsigned int close(void);

  // NOTE(review): not referenced by the implementation file, which uses
  // cycles_per_iter instead - confirm whether it is still needed.
  static const unsigned int NUM_ITER = 1000;
  // 2^intensity hashes are calculated each time the kernel is called
  // Minimum of 2^8 (256) because our default local_item_size is 256
  // global_item_size (2^intensity) must be a multiple of local_item_size
  // Max of 2^32 so that people can't send an hour of work to the GPU at one
  // time
#define MIN_INTENSITY 8
#define MAX_INTENSITY 32
#define DEFAULT_INTENSITY 16

  // Number of times the GPU kernel is called between updating the command line
  // text
#define MIN_CPI 1  // Must do one call per update
#define MAX_CPI 65536  // 2^16 is a slightly arbitrary max
#define DEFAULT_CPI 30

  // The maximum size of the .cl file we read in and compile
#define MAX_SOURCE_SIZE (0x200000)

  // OpenCL objects owned by this test; created in open(), released in
  // close().
  // NOTE(review): other OCLTestImp-derived tests in this module use
  // context_, error_, program_ and kernel_ without declaring them, so these
  // declarations may hide base-class members of the same name - confirm.
  cl_context context_;
  cl_command_queue cmd_queue_;
  cl_int error_;
  cl_program program_;
  cl_kernel kernel_;

  // mem objects for storing our kernel parameters
  cl_mem blockHeadermobj_ = NULL;  // 80-byte block header (kernel arg 0)
  cl_mem nonceOutmobj_ = NULL;     // 8-byte nonce result (kernel arg 1)

  // More global variables the nonceGrind kernel launch needs to access
  size_t local_item_size =
      256;  // Size of local work groups. 256 is usually optimal
  unsigned int blocks_mined = 0;  // not referenced by the implementation file
  unsigned int intensity = DEFAULT_INTENSITY;  // log2 of work-items per launch
  unsigned cycles_per_iter = DEFAULT_CPI;      // kernel launches timed per run()

  bool isAMD;                // set by open() when the platform vendor is AMD
  char platformVersion[32];  // "<major>.<minor>" taken from CL_PLATFORM_VERSION
  // Writes the 20-word (80-byte) test block header into `ptr`.
  void setHeader(uint32_t* ptr);
};
*/ + +#include "OCLPerfImageCopyCorners.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 2 +static const unsigned int Sizes0[NUM_SIZES] = {512, 16384}; +static const unsigned int Sizes1[NUM_SIZES] = {16384, 512}; + +#define NUM_FORMATS 3 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}, + {CL_R, CL_UNSIGNED_INT32}, + {CL_RGBA, CL_UNSIGNED_INT32}}; +static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8", "R32", + "R32G32B32A32"}; +static const unsigned int formatSize[NUM_FORMATS] = { + 4 * sizeof(cl_uchar), 1 * sizeof(cl_uint), 4 * sizeof(cl_uint)}; + +static const unsigned int Iterations[2] = {1, + OCLPerfImageCopyCorners::NUM_ITER}; + +#define NUM_SUBTESTS 3 +OCLPerfImageCopyCorners::OCLPerfImageCopyCorners() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS * 2; +} + +OCLPerfImageCopyCorners::~OCLPerfImageCopyCorners() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageCopyCorners::setData(void *ptr, unsigned int pitch, + unsigned int size) { + unsigned int *ptr2 = (unsigned int *)ptr; + unsigned int value = 0; + for (unsigned int i = 0; i > 2; i++) { + ptr2[i] = value; + value++; + } +} + +void OCLPerfImageCopyCorners::checkData(void *ptr, unsigned int pitch, + unsigned int size) { + unsigned int *ptr2 = (unsigned int *)ptr; + unsigned int value = 0; + for (unsigned int i = 0; i < size >> 2; i++) { + if (ptr2[i] != value) { + printf("Data validation failed at %d! 
Got 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, ptr2[i], ptr2[i + 1], ptr2[i + 2], ptr2[i + 3]); + printf("Expected 0x%08x 0x%08x 0x%08x 0x%08x\n", value, value, value, + value); + CHECK_RESULT(true, "Data validation failed!"); + break; + } + value++; + } +} + +void OCLPerfImageCopyCorners::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + size_t queryOut = 0; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + srcBuffer_ = 0; + dstBuffer_ = 0; + srcImage_ = false; + dstImage_ = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + // if (num_devices > 0) + //{ + // platform = platforms[_platformIndex]; + // break; + //} +#if 0 + } +#endif + delete platforms; + } + + bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % 
NUM_FORMATS; + + if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 1) { + srcImage_ = true; + } + if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 2) { + dstImage_ = true; + } + + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)]; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + if (_openTest % NUM_SIZES) { + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(size_t), &queryOut, NULL); + bufSizeW_ = (cl_uint)queryOut; + bufSizeH_ = Sizes1[_openTest % NUM_SIZES]; + } else { + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(size_t), &queryOut, NULL); + bufSizeW_ = Sizes0[_openTest % NUM_SIZES]; + bufSizeH_ = (cl_uint)queryOut; + } + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + void *mem; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSizeW_, bufSizeH_, 1}; + size_t image_row_pitch; + size_t image_slice_pitch; + unsigned int memSize; + if (dstImage_) { + dstBuffer_ = + _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_, + bufSizeH_, 0, NULL, &error_); + CHECK_RESULT(dstBuffer_ == 0, 
"clCreateImage(dstBuffer) failed"); + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSizeH_; + } else { + dstBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed"); + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + setData(mem, (unsigned int)image_row_pitch, memSize); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + + flags = CL_MEM_READ_ONLY; + if (srcImage_) { + srcBuffer_ = + _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_, + bufSizeH_, 0, NULL, &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed"); + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSizeH_; + } else { + srcBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed"); + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + 
setData(mem, (unsigned int)image_row_pitch, memSize); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL); +} + +void OCLPerfImageCopyCorners::run(void) { + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSizeW_, bufSizeH_, 1}; + + // Warm up + if (srcImage_ == false) { + error_ = _wrapper->clEnqueueCopyBufferToImage( + cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed"); + } else if (dstImage_ == false) { + error_ = _wrapper->clEnqueueCopyImageToBuffer( + cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed"); + } else { + error_ = + _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin, + origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImage failed"); + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + CPerfCounter timer; + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + if (srcImage_ == false) { + error_ = _wrapper->clEnqueueCopyBufferToImage( + cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed"); + } else if (dstImage_ == false) { + error_ = _wrapper->clEnqueueCopyImageToBuffer( + cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed"); + } else { + error_ = + _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, + origin, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImage failed"); + } + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Image copy bandwidth in GB/s + double perf = ((double)bufSizeW_ * bufSizeH_ * formatSize[bufnum_] * 2 * + numIter * (double)(1e-09)) / + sec; 
+ + const char *strSrc = NULL; + const char *strDst = NULL; + if (srcImage_) + strSrc = "img"; + else + strSrc = "buf"; + if (dstImage_) + strDst = "img"; + else + strDst = "buf"; + void *mem; + size_t image_row_pitch; + size_t image_slice_pitch; + unsigned int memSize; + if (dstImage_) { + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSizeH_; + } else { + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, 0, + bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + checkData(mem, (unsigned int)image_row_pitch, memSize); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ", + bufSizeW_, bufSizeH_, textFormats[bufnum_], strSrc, strDst, numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageCopyCorners::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(dstBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return 
_crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h new file mode 100644 index 0000000000..7d761c8e13 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopyCorners.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_ImageCopyCorners_H_ +#define _OCL_ImageCopyCorners_H_ + +#include "OCLTestImp.h" + +/* Perf test that times OpenCL copies between a 2D image and a buffer (or image to image) where open() sets either the width or the height to the device's maximum 2D image dimension. */ class OCLPerfImageCopyCorners : public OCLTestImp { + public: + OCLPerfImageCopyCorners(); + virtual ~OCLPerfImageCopyCorners(); + + public: + /* Selects the platform and device, creates context_ and cmd_queue_, then creates and fills srcBuffer_ and dstBuffer_ for sub-test `test`. */ virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + /* Runs one untimed warm-up copy, then numIter timed copies; stores the GB/s figure in _perfInfo and verifies the destination through checkData(). */ virtual void run(void); + /* Finishes the queue, releases the memory objects, the queue and the context, and returns _crcword. */ virtual unsigned int close(void); + + /* NOTE(review): presumably the iteration count for the multi-iteration sub-tests; the table that consumes it is defined in the .cpp — confirm. */ static const unsigned int NUM_ITER = 10; + + /* OpenCL context created by open(). */ cl_context context_; + /* Command queue on the selected device. */ cl_command_queue cmd_queue_; + /* Copy source: an image when srcImage_ is true, otherwise a buffer. */ cl_mem srcBuffer_; + /* Copy destination: an image when dstImage_ is true, otherwise a buffer. */ cl_mem dstBuffer_; + /* Status code of the most recent OpenCL call. */ cl_int error_; + + /* Width of the copied region, chosen in open(). */ unsigned int bufSizeW_; + /* Height of the copied region, chosen in open(). */ unsigned int bufSizeH_; + /* Index into the formats / formatSize / textFormats tables. */ unsigned int bufnum_; + /* True when the source is created with clCreateImage2D instead of clCreateBuffer. */ bool srcImage_; + /* True when the destination is created with clCreateImage2D instead of clCreateBuffer. */ bool dstImage_; + /* Number of timed copies performed by run(). */ unsigned int numIter; + /* Called by open() on the mapped memory of each object before it is unmapped. */ void setData(void* ptr, unsigned int pitch, unsigned int size); + /* Called by run() on the mapped destination to validate the copied data. */ void checkData(void* ptr, unsigned int pitch, unsigned int size); +}; + +#endif // _OCL_ImageCopyCorners_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp new file mode 100644 index 0000000000..5d62de9dad --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.cpp @@ -0,0 +1,344 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageCopySpeed.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048}; + +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"}; +static const unsigned int formatSize[NUM_FORMATS] = {4 * sizeof(cl_uchar)}; + +static const unsigned int Iterations[2] = {1, OCLPerfImageCopySpeed::NUM_ITER}; + +#define NUM_SUBTESTS 3 +OCLPerfImageCopySpeed::OCLPerfImageCopySpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS * 2; +} + +OCLPerfImageCopySpeed::~OCLPerfImageCopySpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageCopySpeed::setData(void *ptr, unsigned int pitch, + unsigned int size, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < size >> 2; i++) { + ptr2[i] = value; + } +} + +void OCLPerfImageCopySpeed::checkData(void *ptr, unsigned int pitch, + unsigned int size, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < size >> 2; i++) { + if (ptr2[i] != value) { + printf("Data validation failed at %d! 
Got 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, ptr2[i], ptr2[i + 1], ptr2[i + 2], ptr2[i + 3]); + printf("Expected 0x%08x 0x%08x 0x%08x 0x%08x\n", value, value, value, + value); + break; + } + } +} + +void OCLPerfImageCopySpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + srcBuffer_ = 0; + dstBuffer_ = 0; + srcImage_ = false; + dstImage_ = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + // if (num_devices > 0) + //{ + // platform = platforms[_platformIndex]; + // break; + //} +#if 0 + } +#endif + delete platforms; + } + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % NUM_FORMATS; + + if ((((_openTest / NUM_SIZES) % 
NUM_SUBTESTS) + 1) & 1) { + srcImage_ = true; + } + if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1) & 2) { + dstImage_ = true; + } + + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)]; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + void *mem; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, 1}; + size_t image_row_pitch; + size_t image_slice_pitch; + unsigned int memSize; + if (dstImage_) { + dstBuffer_ = + _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSize_, + bufSize_, 0, NULL, &error_); + CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed"); + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSize_; + } else { + dstBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(dstBuffer_ == 0, 
"clCreateBuffer(dstBuffer) failed"); + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSize_ * bufSize_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + setData(mem, (unsigned int)image_row_pitch, memSize, 0xdeadbeef); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + + flags = CL_MEM_READ_ONLY; + if (srcImage_) { + srcBuffer_ = + _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSize_, + bufSize_, 0, NULL, &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed"); + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSize_; + } else { + srcBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed"); + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSize_ * bufSize_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + setData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL); +} + +void OCLPerfImageCopySpeed::run(void) { + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, 1}; + + // Warm up + if (srcImage_ == false) { + error_ = _wrapper->clEnqueueCopyBufferToImage( + cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, 
"clEnqueueCopyBufferToImage failed"); + } else if (dstImage_ == false) { + error_ = _wrapper->clEnqueueCopyImageToBuffer( + cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed"); + } else { + error_ = + _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin, + origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImage failed"); + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + CPerfCounter timer; + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + if (srcImage_ == false) { + error_ = _wrapper->clEnqueueCopyBufferToImage( + cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed"); + } else if (dstImage_ == false) { + error_ = _wrapper->clEnqueueCopyImageToBuffer( + cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed"); + } else { + error_ = + _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, + origin, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImage failed"); + } + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Image copy bandwidth in GB/s + double perf = ((double)bufSize_ * bufSize_ * formatSize[bufnum_] * 2 * + numIter * (double)(1e-09)) / + sec; + + const char *strSrc = NULL; + const char *strDst = NULL; + if (srcImage_) + strSrc = "img"; + else + strSrc = "buf"; + if (dstImage_) + strDst = "img"; + else + strDst = "buf"; + void *mem; + size_t image_row_pitch; + size_t image_slice_pitch; + unsigned int memSize; + if (dstImage_) { + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, 
&error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSize_; + } else { + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, 0, + bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSize_ * bufSize_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + checkData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ", + bufSize_, bufSize_, textFormats[bufnum_], strSrc, strDst, numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageCopySpeed::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(dstBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h new file mode 100644 index 0000000000..570ab9511e --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCopySpeed.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_ImageCopySpeed_H_ +#define _OCL_ImageCopySpeed_H_ + +#include "OCLTestImp.h" + +/* Perf test that times OpenCL copies between square 2D images and buffers (buffer to image, image to buffer, image to image) and reports the bandwidth in GB/s. */ class OCLPerfImageCopySpeed : public OCLTestImp { + public: + OCLPerfImageCopySpeed(); + virtual ~OCLPerfImageCopySpeed(); + + public: + /* Selects the platform and device, creates context_ and cmd_queue_, then creates srcBuffer_ and dstBuffer_ for sub-test `test` and fills them with distinct patterns. */ virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + /* Runs one untimed warm-up copy, then numIter timed copies; stores the GB/s figure in _perfInfo and checks the destination for the source pattern. */ virtual void run(void); + /* Finishes the queue, releases the memory objects, the queue and the context, and returns _crcword. */ virtual unsigned int close(void); + + /* Second entry of the Iterations table in the .cpp, i.e. the iteration count of the multi-iteration sub-tests. */ static const unsigned int NUM_ITER = 1000; + + /* OpenCL context created by open(). */ cl_context context_; + /* Command queue on the selected device. */ cl_command_queue cmd_queue_; + /* Copy source: an image when srcImage_ is true, otherwise a buffer. */ cl_mem srcBuffer_; + /* Copy destination: an image when dstImage_ is true, otherwise a buffer. */ cl_mem dstBuffer_; + /* Status code of the most recent OpenCL call. */ cl_int error_; + + /* Side length of the square copy region, taken from the Sizes table. */ unsigned int bufSize_; + /* Index into the formats / formatSize / textFormats tables. */ unsigned int bufnum_; + /* True when the source is created with clCreateImage2D instead of clCreateBuffer. */ bool srcImage_; + /* True when the destination is created with clCreateImage2D instead of clCreateBuffer. */ bool dstImage_; + /* Number of timed copies performed by run(). */ unsigned int numIter; + /* Writes `value` into each of the size / 4 32-bit words at ptr; `pitch` is not used. */ void setData(void* ptr, unsigned int pitch, unsigned int size, + unsigned int value); + /* Compares each of the size / 4 32-bit words at ptr with `value` and prints the first mismatch; `pitch` is not used. */ void checkData(void* ptr, unsigned int pitch, unsigned int size, + unsigned int value); +}; + +#endif // _OCL_ImageCopySpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp new file mode 100644 index 0000000000..7502b65aa0 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.cpp @@ -0,0 +1,194 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageCreate.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +/* Image side lengths exercised by the sub-tests. */ +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048}; + +/* Image formats under test, their printable names and the per-pixel byte counts used to size the host buffer. NOTE(review): formatSize takes sizeof() of the CL channel-type constants, not of a pixel — confirm this is intended. */ +#if defined(CL_VERSION_2_0) +#define NUM_FORMATS 3 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}, + {CL_sRGBA, CL_UNORM_INT8}, + {CL_DEPTH, CL_UNORM_INT16}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8", + "CL_sRGBA, CL_UNORM_INT8 ", + "CL_DEPTH, CL_UNORM_INT16 "}; +static const unsigned int formatSize[NUM_FORMATS] = { + sizeof(CL_UNSIGNED_INT8), sizeof(CL_UNORM_INT8), sizeof(CL_UNORM_INT16)}; +#else +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA, CL_UNSIGNED_INT8"}; +static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8)}; +#endif + +/* One sub-test per (size, format) pair. */ +OCLPerfImageCreate::OCLPerfImageCreate() { + _numSubTests = NUM_SIZES * NUM_FORMATS; +} + +OCLPerfImageCreate::~OCLPerfImageCreate() {} + +/* Empty context-notification callback; not referenced in this file. */ +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +/* Fills the size / 4 32-bit words at ptr with value, value + 1, value + 2, ... */ +void OCLPerfImageCreate::setData(void *ptr, unsigned int size, + unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < size >> 2; i++) { 
+ ptr2[i] = value; + value++; + } +} + +/* Prepares sub-test `test`: opens the base test, skips devices whose CL_DEVICE_VERSION major digit is below 2, then derives the image size and format from the sub-test index and allocates the host buffer and the array of image handles. */ +void OCLPerfImageCreate::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + testId_ = test; + /* Clear the members close() inspects before any early return, so that a failed or skipped open() never leaves them uninitialised. */ + outBuffer_ = 0; + memptr = 0; + skip_ = false; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + + // check device version + size_t param_size = 0; + char *strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, &param_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + /* The version string has the form "OpenCL <major>.<minor> ...", so index 7 is the major digit. */ + if (strVersion[7] < '2') { + skip_ = true; + testDescString = + "sRGBA Image not supported for < 2.0 devices. Test Skipped."; + delete[] strVersion; + return; + } + delete[] strVersion; + + bufSize_ = Sizes[test % NUM_SIZES]; + bufnum_ = (test / NUM_SIZES) % NUM_FORMATS; + memSize = bufSize_ * bufSize_ * formatSize[bufnum_]; + numIter = 100; + + /* calloc zero-fills the handles, so close() can tell created images from untouched slots. */ + outBuffer_ = (cl_mem *)calloc(numIter, sizeof(cl_mem)); + memptr = new char[memSize]; + + cmd_queue_ = cmdQueues_[_deviceId]; +} + +/* Times numIter rounds of clCreateImage on the host buffer (CL_MEM_USE_HOST_PTR) followed by a blocking 1x1 read, and reports the result as GB/s in _perfInfo. Does nothing when open() marked the sub-test as skipped. */ +void OCLPerfImageCreate::run(void) { + if (skip_) { + return; + } + + CPerfCounter timer; + + cl_image_desc imageInfo; + + memset(&imageInfo, 0x0, sizeof(cl_image_desc)); + + imageInfo.image_type = CL_MEM_OBJECT_IMAGE2D; + imageInfo.image_width = bufSize_; + imageInfo.image_height = bufSize_; + imageInfo.image_depth = 1; + imageInfo.image_array_size = 1; + imageInfo.image_row_pitch = bufSize_ * formatSize[bufnum_]; + imageInfo.image_slice_pitch = imageInfo.image_row_pitch * (bufSize_); + + setData(memptr, memSize, 0xdeadbeef); + + char *dstmem = new char[memSize]; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {1, 1, 1}; + + timer.Reset(); + timer.Start(); + + for 
(unsigned int i = 0; i < numIter; ++i) { + outBuffer_[i] = + clCreateImage(context_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, + &formats[bufnum_], &imageInfo, memptr, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "Error clCreateImage()"); + + error_ = + _wrapper->clEnqueueReadImage(cmd_queue_, outBuffer_[i], CL_TRUE, origin, + region, 0, 0, dstmem, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueReadImage failed"); + _wrapper->clFinish(cmd_queue_); + } + + timer.Stop(); + + /* dstmem was allocated with new[] */ + delete[] dstmem; + + double sec = timer.GetElapsedTime(); + + // Image create in GB/s + double perf = ((double)memSize * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s(%1d) i: %4d (GB/s) ", bufSize_, + bufSize_, textFormats[fmt_num], formatSize[bufnum_], numIter); + testDescString = buf; +} + +/* Releases every image created by run(), the handle array and the host buffer, then closes the base test and returns its result. */ +unsigned int OCLPerfImageCreate::close(void) { + /* Release the images first: they were created with CL_MEM_USE_HOST_PTR on memptr, so the host buffer has to outlive them. */ + if (outBuffer_) { + for (unsigned int i = 0; i < numIter; ++i) { + if (outBuffer_[i]) { + error_ = _wrapper->clReleaseMemObject(outBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_[i]) failed"); + } + } + free(outBuffer_); + outBuffer_ = 0; + } + /* memptr was allocated with new[]; delete[] on a null pointer is a no-op. */ + delete[] memptr; + memptr = 0; + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h new file mode 100644 index 0000000000..5d717a5d12 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageCreate.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_ImageCreate_H_ +#define _OCL_ImageCreate_H_ + +#include "OCLTestImp.h" + +/* Perf test that times repeated clCreateImage calls on a host buffer (CL_MEM_USE_HOST_PTR), each followed by a small blocking read, and reports the result in GB/s. */ class OCLPerfImageCreate : public OCLTestImp { + public: + OCLPerfImageCreate(); + virtual ~OCLPerfImageCreate(); + + public: + /* Opens the base test, marks the sub-test as skipped on devices reporting a version below 2, and otherwise picks the size and format and allocates memptr and outBuffer_. */ virtual void open(unsigned int test, char *units, double &conversion, + unsigned int deviceID); + /* Creates and reads back numIter images while timing, then stores the GB/s figure in _perfInfo; returns immediately when skip_ is set. */ virtual void run(void); + /* Frees memptr, releases the images held in outBuffer_ and returns the base-class close() result. */ virtual unsigned int close(void); + /* Fills the size / 4 32-bit words at ptr with value, value + 1, value + 2, ... */ virtual void setData(void *ptr, unsigned int size, unsigned int value); + + /* Queue of the selected device, taken from cmdQueues_ in open(). */ cl_command_queue cmd_queue_; + /* Array of numIter image handles allocated in open() and filled by run(). */ cl_mem *outBuffer_; + + /* Image side length, taken from the Sizes table. */ unsigned int bufSize_; + /* Index into the formats / formatSize tables. */ unsigned int bufnum_; + /* Number of images created per run(); open() sets it to 100. */ unsigned int numIter; + /* Host buffer of memSize bytes passed to clCreateImage as the host pointer. */ char *memptr; + /* bufSize_ * bufSize_ * formatSize[bufnum_], in bytes. */ unsigned int memSize; + /* Sub-test index received by open(). */ unsigned int testId_; + + /* Set by open() when the device version is below 2; run() then does nothing. */ bool skip_; +}; + +#endif // _OCL_ImageCreate_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp new file mode 100644 index 0000000000..926d5b3f65 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.cpp @@ -0,0 +1,333 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageMapUnmap.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 1 +static const unsigned int Sizes0[2] = {0xc0, 0x18a}; + +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = {{CL_R, CL_SNORM_INT16}}; +static const char *textFormats[NUM_FORMATS] = {"R16"}; +static const unsigned int formatSize[NUM_FORMATS] = {2 * sizeof(cl_uchar)}; + +static const unsigned int Iterations[2] = {1, OCLPerfImageMapUnmap::NUM_ITER}; + +#define NUM_SUBTESTS 1 +OCLPerfImageMapUnmap::OCLPerfImageMapUnmap() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS * 1; +} + +OCLPerfImageMapUnmap::~OCLPerfImageMapUnmap() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageMapUnmap::setData(void *ptr, unsigned int pitch, + unsigned int size, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + value = 0; + for (unsigned int i = 0; i < size >> 2; i++) { + ptr2[i] = value; + value++; + } +} + +void OCLPerfImageMapUnmap::checkData(void *ptr, unsigned int pitch, + unsigned int size, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + value = 0; + for (unsigned int i = 0; i < size >> 2; i++) { + if (ptr2[i] != value) { + printf("Data validation failed at %d! 
Got 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, ptr2[i], ptr2[i + 1], ptr2[i + 2], ptr2[i + 3]); + printf("Expected 0x%08x 0x%08x 0x%08x 0x%08x\n", value, value, value, + value); + CHECK_RESULT(true, "Data validation failed!"); + break; + } + value++; + } +} + +void OCLPerfImageMapUnmap::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + size_t queryOut = 0; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + srcBuffer_ = 0; + dstBuffer_ = 0; + srcImage_ = false; + dstImage_ = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + // if (num_devices > 0) + //{ + // platform = platforms[_platformIndex]; + // break; + //} +#if 0 + } +#endif + delete platforms; + } + + bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % 
NUM_FORMATS; + + srcImage_ = true; + + dstImage_ = false; + + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)]; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + bufSizeW_ = Sizes0[0]; + bufSizeH_ = Sizes0[1]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + cl_mem_flags flags2 = CL_MEM_WRITE_ONLY; + void *mem; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSizeW_, bufSizeH_, 1}; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_image_desc imageInfo; + + memset(&imageInfo, 0x0, sizeof(cl_image_desc)); + + imageInfo.image_type = CL_MEM_OBJECT_IMAGE2D; + imageInfo.image_width = bufSizeW_; + imageInfo.image_height = bufSizeH_; + imageInfo.image_depth = 1; + imageInfo.image_array_size = 1; + imageInfo.image_row_pitch = bufSizeW_ * formatSize[bufnum_]; + imageInfo.image_slice_pitch = imageInfo.image_row_pitch * (bufSizeH_); + + void *host_ptr = malloc(imageInfo.image_row_pitch * imageInfo.image_height); + + unsigned int memSize; + if (dstImage_) { + dstBuffer_ = + _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_, + bufSizeH_, 0, host_ptr, &error_); + 
CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed"); + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSizeH_; + } else { + dstBuffer_ = _wrapper->clCreateBuffer( + context_, flags2, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed"); + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + setData(mem, (unsigned int)image_row_pitch, memSize, 0xdeadbeef); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + + flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR; + if (srcImage_) { + srcBuffer_ = _wrapper->clCreateImage(context_, flags, &formats[bufnum_], + &imageInfo, host_ptr, &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed"); + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSizeH_; + error_ = _wrapper->clFinish(cmd_queue_); + } else { + srcBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSizeW_ * bufSizeH_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed"); + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize 
= (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + setData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL); + error_ = _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfImageMapUnmap::run(void) { + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSizeW_, bufSizeH_, 1}; + + if (srcImage_ == false) { + error_ = _wrapper->clEnqueueCopyBufferToImage( + cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed"); + } else if (dstImage_ == false) { + error_ = _wrapper->clEnqueueCopyImageToBuffer( + cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed"); + } else { + error_ = + _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin, + origin, region, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyImage failed"); + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + const char *strSrc = NULL; + const char *strDst = NULL; + if (srcImage_) + strSrc = "img"; + else + strSrc = "buf"; + if (dstImage_) + strDst = "img"; + else + strDst = "buf"; + void *mem; + size_t image_row_pitch; + size_t image_slice_pitch; + unsigned int memSize; + if (dstImage_) { + mem = _wrapper->clEnqueueMapImage( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region, + &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed"); + memSize = (unsigned int)image_row_pitch * bufSizeH_; + } else { + mem = _wrapper->clEnqueueMapBuffer( + cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, 0, + bufSizeW_ * bufSizeH_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memSize = (unsigned int)bufSizeW_ * bufSizeH_ * formatSize[bufnum_]; + image_row_pitch = 0; + } + 
checkData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d); + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL); + _perfInfo = 0; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ", + bufSizeW_, bufSizeH_, textFormats[bufnum_], strSrc, strDst, numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageMapUnmap::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(dstBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h new file mode 100644 index 0000000000..9f061581de --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageMapUnmap.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_ImageMapUnmap_H_ +#define _OCL_ImageMapUnmap_H_ + +#include "OCLTestImp.h" + +class OCLPerfImageMapUnmap : public OCLTestImp { + public: + OCLPerfImageMapUnmap(); + virtual ~OCLPerfImageMapUnmap(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 1; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_mem srcBuffer_; + cl_mem dstBuffer_; + cl_int error_; + + unsigned int bufSizeW_; + unsigned int bufSizeH_; + unsigned int bufnum_; + bool srcImage_; + bool dstImage_; + unsigned int numIter; + void setData(void* ptr, unsigned int pitch, unsigned int size, + unsigned int value); + void checkData(void* ptr, unsigned int pitch, unsigned int size, + unsigned int value); +}; + +#endif // _OCL_ImageMapUnmap_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp new file mode 100644 index 0000000000..7f87c24515 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.cpp @@ -0,0 +1,295 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageReadSpeed.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048}; + +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"}; +static const unsigned int formatSize[NUM_FORMATS] = {4}; + +static const unsigned int Iterations[2] = {1, OCLPerfImageReadSpeed::NUM_ITER}; + +OCLPerfImageReadSpeed::OCLPerfImageReadSpeed() { + _numSubTests = NUM_SIZES * NUM_FORMATS * 2; +} + +OCLPerfImageReadSpeed::~OCLPerfImageReadSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageReadSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + memptr = NULL; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + 
cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, &num_devices); + delete platforms; + } + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS; + numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)]; + + CHECK_RESULT(platform == 0, "Couldn't find platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], + bufSize_, bufSize_, 0, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed"); + memptr = new char[bufSize_ * bufSize_ * formatSize[bufnum_]]; +} + +void OCLPerfImageReadSpeed::run(void) { + CPerfCounter timer; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, 1}; + // Warm up + error_ = 
_wrapper->clEnqueueReadImage(cmd_queue_, outBuffer_, CL_TRUE, origin, + region, 0, 0, memptr, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadImage failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = + _wrapper->clEnqueueReadImage(cmd_queue_, outBuffer_, CL_TRUE, origin, + region, 0, 0, memptr, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadImage failed"); + } + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Image read bandwidth in GB/s + double perf = ((double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter * + (double)(1e-09)) / + sec; + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s i: %4d (GB/s) ", bufSize_, + bufSize_, textFormats[bufnum_], numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageReadSpeed::close(void) { + if (memptr) { + delete memptr; + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} + +OCLPerfPinnedImageReadSpeed::OCLPerfPinnedImageReadSpeed() { + _numSubTests = NUM_SIZES * NUM_FORMATS * 2; +} + +OCLPerfPinnedImageReadSpeed::~OCLPerfPinnedImageReadSpeed() {} + +void OCLPerfPinnedImageReadSpeed::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + 
cmd_queue_ = 0; + outBuffer_ = 0; + memptr = NULL; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, &num_devices); + delete platforms; + } + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS; + numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)]; + + CHECK_RESULT(platform == 0, "Couldn't find platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + inBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + flags = CL_MEM_WRITE_ONLY; 
+ outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], + bufSize_, bufSize_, 0, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed"); + + memptr = (char *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); +} + +unsigned int OCLPerfPinnedImageReadSpeed::close(void) { + if (memptr) { + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, memptr, 0, + NULL, NULL); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clEnqueueUnmapMemObject(inBuffer_) failed"); + clFinish(cmd_queue_); + } + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h new file mode 100644 index 0000000000..e1d8498610 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadSpeed.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
 */
+
+#ifndef _OCL_ImageReadSpeed_H_
+#define _OCL_ImageReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Measures clEnqueueReadImage bandwidth from a 2D image into host memory.
+class OCLPerfImageReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfImageReadSpeed();
+  virtual ~OCLPerfImageReadSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Iteration count used by the second half of the subtests (the first half
+  // runs a single iteration).
+  static const unsigned int NUM_ITER = 100;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;  // the 2D image that is read back
+  cl_int error_;
+
+  unsigned int bufSize_;  // image edge length in pixels (square image)
+  unsigned int bufnum_;   // index into the format tables
+  unsigned int numIter;
+  // Destination of the reads: a new[] allocation in this class, a mapped
+  // buffer pointer in OCLPerfPinnedImageReadSpeed.
+  char* memptr;
+};
+
+// Variant that reads into the mapping of a CL_MEM_ALLOC_HOST_PTR buffer.
+// run() is inherited unchanged.
+class OCLPerfPinnedImageReadSpeed : public OCLPerfImageReadSpeed {
+ public:
+  OCLPerfPinnedImageReadSpeed();
+  virtual ~OCLPerfPinnedImageReadSpeed();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual unsigned int close(void);
+
+  cl_mem inBuffer_;  // buffer whose mapping provides memptr
+};
+#endif  // _OCL_ImageReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp
new file mode 100644
index 0000000000..3a668554f7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageReadWrite.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define KERNEL_CODE(...) #__VA_ARGS__ + +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048}; + +#if defined(CL_VERSION_2_0) +#define NUM_FORMATS 2 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}, {CL_sRGBA, CL_UNORM_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8", + "CL_sRGBA, CL_UNORM_INT8 "}; +static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8), + sizeof(CL_UNORM_INT8)}; +#else +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"}; +static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8)}; +#endif + +const static char *strKernel = {KERNEL_CODE( + \n __constant sampler_t s_nearest = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE; + \n __kernel void image_kernel(read_write image2d_t image, uint zero) { + int x = get_global_id(0); + int y = get_global_id(1); + + int offset = y * get_image_width(image) + x; + + int2 coords = (int2)(x, y); + uint4 tmp = read_imageui(image, s_nearest, coords); + + write_imageui(image, coords, 1 + tmp * zero); +} + \n)}; 
+ +OCLPerfImageReadWrite::OCLPerfImageReadWrite() { + _numSubTests = NUM_SIZES * NUM_FORMATS; +} + +OCLPerfImageReadWrite::~OCLPerfImageReadWrite() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageReadWrite::setData(void *ptr, unsigned int size, + unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < size >> 2; i++) { + ptr2[i] = value; + value++; + } +} + +void OCLPerfImageReadWrite::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + testId_ = test; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + imageBuffer_ = 0; + skip_ = false; + + // check device version + size_t param_size = 0; + char *strVersion = 0; + error_ = _wrapper->clGetDeviceInfo( + devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = + _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[9] < '2') { + skip_ = true; + testDescString = + "Image read_write qualifier not supported in OpenCL C < 2.0. 
Test " + "Skipped."; + delete strVersion; + return; + } + delete strVersion; + + bufSize_ = Sizes[test % NUM_SIZES]; + bufnum_ = (test / NUM_SIZES) % NUM_FORMATS; + memSize = bufSize_ * bufSize_ * formatSize[bufnum_]; + numIter = 100; + + memptr = new char[memSize]; + + cmd_queue_ = cmdQueues_[_deviceId]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + // create image + setData(memptr, memSize, 0x0); + imageBuffer_ = _wrapper->clCreateImage2D( + context_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, &formats[bufnum_], + bufSize_, bufSize_, 0, memptr, &error_); + CHECK_RESULT(error_ != CL_SUCCESS, "clCreateImage2D() failed"); + + const unsigned int zero = 0; + + // set kernel arguments + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &zero); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); +} + +void OCLPerfImageReadWrite::run(void) { + if (skip_) { + return; + } + + CPerfCounter timer; + + size_t gws[2] = {bufSize_, bufSize_}; + size_t lws[2] = {8, 8}; + + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL, gws, + lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + 
_wrapper->clFinish(cmd_queue_); + + timer.Reset(); + timer.Start(); + + for (unsigned int i = 0; i < numIter; ++i) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL, gws, + lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmd_queue_); + } + + timer.Stop(); + + double sec = timer.GetElapsedTime(); + + // speed in GB/s + double perf = ((double)memSize * numIter * (double)(1e-09)) * 2 / sec; + + _perfInfo = (float)perf; + char buf[256]; + unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s(%1d) i: %4d (GB/s) ", bufSize_, + bufSize_, textFormats[fmt_num], formatSize[bufnum_], numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageReadWrite::close(void) { + if (!skip_) { + if (memptr) { + delete memptr; + } + if (imageBuffer_) { + error_ = _wrapper->clReleaseMemObject(imageBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(imageBuffer_) failed"); + } + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h new file mode 100644 index 0000000000..327786527c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadWrite.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageReadWrite
+#define _OCL_ImageReadWrite
+
+#include "OCLTestImp.h"
+
+// Performance test that repeatedly launches the "image_kernel" kernel over a
+// square 2D image and reports throughput in GB/s. The test marks itself as
+// skipped when the device reports an OpenCL C version below 2.0.
+class OCLPerfImageReadWrite : public OCLTestImp {
+ public:
+  OCLPerfImageReadWrite();
+  virtual ~OCLPerfImageReadWrite();
+
+ public:
+  // Builds the kernel and creates the image for sub-test `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Runs one warm-up launch plus numIter timed launches.
+  virtual void run(void);
+  // Releases the resources created by open(); returns OCLTestImp::close().
+  virtual unsigned int close(void);
+  // Fills `ptr` (`size` bytes) with consecutive 32-bit values from `value`.
+  virtual void setData(void* ptr, unsigned int size, unsigned int value);
+
+  cl_command_queue cmd_queue_;  // taken from cmdQueues_[_deviceId] in open()
+  cl_mem imageBuffer_;          // 2D image created over memptr
+
+  unsigned int bufSize_;  // image width and height, in pixels
+  unsigned int bufnum_;   // index into the image format tables
+  unsigned int numIter;   // number of timed kernel launches
+  char* memptr;           // host backing store for imageBuffer_
+  unsigned int memSize;   // size of memptr, in bytes
+  unsigned int testId_;   // sub-test index passed to open()
+
+  bool skip_;  // true when open() decided the device cannot run the test
+};
+
+#endif  // _OCL_ImageReadWrite
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp
new file mode 100644
index 0000000000..5ad33bc14c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.cpp
@@ -0,0 +1,236 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageReadsRGBA.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define KERNEL_CODE(...) #__VA_ARGS__ + +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048}; + +#if defined(CL_VERSION_2_0) +#define NUM_FORMATS 2 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}, {CL_sRGBA, CL_UNORM_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8", + "CL_sRGBA, CL_UNORM_INT8 "}; +static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8), + sizeof(CL_UNORM_INT8)}; +#else +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"}; +static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8)}; +#endif + +const static char *strKernel = {KERNEL_CODE( + \n __constant sampler_t s_nearest = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE; + \n + // Read sRGBA image object (input) and convert it to linear RGB values + // (results): + __kernel void image_kernel(read_only image2d_t input, + __global float4 *results) { + int x = get_global_id(0); + int y = get_global_id(1); + + int offset = y * get_image_width(input) + x; + + int2 coords = (int2)(x, y); + float4 tmp = read_imagef(input, s_nearest, coords); + if (x < 0 && tmp.x == 0.f) { + results[offset] = tmp; + } + } + \n)}; + +OCLPerfImageReadsRGBA::OCLPerfImageReadsRGBA() { + _numSubTests = NUM_SIZES * 
NUM_FORMATS; +} + +OCLPerfImageReadsRGBA::~OCLPerfImageReadsRGBA() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageReadsRGBA::setData(void *ptr, unsigned int size, float value) { + unsigned int *ptr_i = (unsigned int *)ptr; + for (unsigned int i = 0; i < size >> 2; i++) { + ptr_i[i] = (int)value; + value++; + } +} + +void OCLPerfImageReadsRGBA::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + testId_ = test; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + imageBuffer_ = 0; + valueBuffer_ = 0; + skip_ = false; + + // check device version + size_t param_size = 0; + char *strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[7] < '2') { + skip_ = true; + testDescString = + "sRGBA Image not supported for < 2.0 devices. 
Test Skipped."; + delete strVersion; + return; + } + delete strVersion; + + bufSize_ = Sizes[test % NUM_SIZES]; + bufnum_ = (test / NUM_SIZES) % NUM_FORMATS; + memSize = bufSize_ * bufSize_ * formatSize[bufnum_]; + numIter = 100; + + memptr = new char[memSize]; + + cmd_queue_ = cmdQueues_[_deviceId]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + setData(memptr, memSize, 0.f); + + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, 1}; + + // create image + imageBuffer_ = _wrapper->clCreateImage2D( + context_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &formats[bufnum_], + bufSize_, bufSize_, 0, memptr, &error_); + CHECK_RESULT(imageBuffer_ == 0, "clCreateImage2D(imageBuffer_) failed"); + + valueBuffer_ = clCreateBuffer( + context_, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, memSize, 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "Error clCreateBuffer()"); + + // set kernel arguments + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &valueBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); +} + +void OCLPerfImageReadsRGBA::run(void) { + if (skip_) { + return; + } + + CPerfCounter timer; + + size_t gws[2] = 
{bufSize_, bufSize_}; + size_t lws[2] = {8, 8}; + + // warm-up + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL, gws, + lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Reset(); + timer.Start(); + + for (unsigned int i = 0; i < numIter; ++i) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 2, NULL, gws, + lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmd_queue_); + } + + timer.Stop(); + + double sec = timer.GetElapsedTime(); + + // read_imagef from sRGB to linear RGB speed in GB/s + double perf = ((double)memSize * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s(%1d) i: %4d (GB/s) ", bufSize_, + bufSize_, textFormats[fmt_num], formatSize[bufnum_], numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageReadsRGBA::close(void) { + if (memptr) { + delete memptr; + } + if (imageBuffer_) { + error_ = _wrapper->clReleaseMemObject(imageBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(imageBuffer_) failed"); + } + if (valueBuffer_) { + error_ = _wrapper->clReleaseMemObject(valueBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(valueBuffer_) failed"); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h new file mode 100644 index 0000000000..60f0ad6b79 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageReadsRGBA.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ImageReadsRGBA_H_
+#define _OCL_ImageReadsRGBA_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times read_imagef() over a square 2D image (RGBA and,
+// when CL_VERSION_2_0 is defined, sRGBA) and reports throughput in GB/s. The
+// test marks itself as skipped when the device reports OpenCL < 2.0.
+class OCLPerfImageReadsRGBA : public OCLTestImp {
+ public:
+  OCLPerfImageReadsRGBA();
+  virtual ~OCLPerfImageReadsRGBA();
+
+ public:
+  // Builds the kernel and creates the image/buffer for sub-test `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Runs one warm-up launch plus numIter timed launches.
+  virtual void run(void);
+  // Releases the resources created by open(); returns OCLTestImp::close().
+  virtual unsigned int close(void);
+  // Fills `ptr` (`size` bytes) with consecutive integers from (int)value.
+  virtual void setData(void* ptr, unsigned int size, float value);
+
+  cl_command_queue cmd_queue_;  // taken from cmdQueues_[_deviceId] in open()
+  cl_mem imageBuffer_;          // read-only 2D image created over memptr
+  cl_mem valueBuffer_;          // write-only buffer bound as kernel argument 1
+
+  unsigned int bufSize_;  // image width and height, in pixels
+  unsigned int bufnum_;   // index into the image format tables
+  unsigned int numIter;   // number of timed kernel launches
+  char* memptr;           // host backing store for imageBuffer_
+  unsigned int memSize;   // size of memptr and valueBuffer_, in bytes
+  unsigned int testId_;   // sub-test index passed to open()
+
+  bool skip_;  // true when open() decided the device cannot run the test
+};
+
+#endif  // _OCL_ImageReadsRGBA_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp
new file mode 100644
index 0000000000..f2a9933c78
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.cpp
@@ -0,0 +1,324 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageSampleRate.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_TYPES 6 +static const cl_image_format formats[NUM_TYPES] = { + {CL_R, CL_UNSIGNED_INT8}, {CL_RG, CL_UNSIGNED_INT8}, + {CL_RGBA, CL_UNSIGNED_INT8}, {CL_R, CL_FLOAT}, + {CL_RGBA, CL_HALF_FLOAT}, {CL_RGBA, CL_FLOAT}}; +static const char *types[NUM_TYPES] = { + "R8", "R8G8", "R8G8B8A8", "R32F", "R16G16B16A16F", "R32G32B32A32F"}; +static const unsigned int typeSizes[NUM_TYPES] = {1, 2, 4, 4, 8, 16}; + +#define NUM_SIZES 12 +static const unsigned int sizes[NUM_SIZES] = {1, 2, 4, 8, 16, 32, + 64, 128, 256, 512, 1024, 2048}; + +#define NUM_BUFS 6 +#define MAX_BUFS (1 << (NUM_BUFS - 1)) + +OCLPerfImageSampleRate::OCLPerfImageSampleRate() { + _numSubTests = NUM_TYPES * NUM_SIZES * NUM_BUFS; +} + +OCLPerfImageSampleRate::~OCLPerfImageSampleRate() {} + +void OCLPerfImageSampleRate::setKernel(void) { + shader_.clear(); + shader_ += + "kernel void sampleRate(global float4* outBuffer, unsigned int " + "inBufSize, unsigned int writeIt,\n"; + char buf[256]; + for (unsigned int i = 0; i < numBufs_; i++) { + SNPRINTF(buf, sizeof(buf), "read_only image2d_t inBuffer%d", i); + shader_ += buf; + if (i < (numBufs_ - 1)) { + shader_ += ","; + } + shader_ += "\n"; + } + shader_ += ")\n"; + shader_ += + "{\n" + " uint gid = get_global_id(0);\n" + " uint inputIdx = gid % inBufSize;\n" + " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | " + "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" + " float4 tmp = (float4)0.0f;\n"; + + for (unsigned int i = 0; i < numBufs_; i++) { + SNPRINTF(buf, 
sizeof(buf), + " tmp += read_imagef(inBuffer%d, sampler, (int2)( gid %% " + "inBufSize, (gid / inBufSize) %% inBufSize));\n", + i); + shader_ += buf; + } + shader_ += + " if (writeIt*(unsigned int)tmp.x) outBuffer[gid] = tmp;\n" + "}\n"; + // printf("Shader -> %s\n", shader_.c_str()); +} + +void OCLPerfImageSampleRate::setData(cl_mem buffer, unsigned int val) { + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width_, width_, 1}; + size_t image_row_pitch; + size_t image_slice_pitch; + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapImage( + cmd_queue_, buffer, true, CL_MAP_WRITE, origin, region, &image_row_pitch, + &image_slice_pitch, 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < width_ * width_; i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +void OCLPerfImageSampleRate::checkData(cl_mem buffer) { +#if 0 + float* data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0, outBufSize_, 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < outBufSize_/sizeof(float); i++) + { + if (data[i] != (float)numBufs_) { + printf("Data validation failed at %d! 
Got %f, expected %f\n", i, data[i], (float)numBufs_); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, NULL); +#endif +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageSampleRate::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + + // We compute a square domain + width_ = sizes[test % NUM_SIZES]; + numBufs_ = (1 << ((test / NUM_SIZES) % NUM_BUFS)); + typeIdx_ = (test / (NUM_SIZES * NUM_BUFS)) % NUM_TYPES; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + delete platforms; + } + /* + * If we could find a platform, use it. 
+ */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = (cl_mem *)malloc(sizeof(cl_mem) * numBufs_); + memset(inBuffer_, 0, sizeof(cl_mem) * numBufs_); + for (unsigned int i = 0; i < numBufs_; i++) { + inBuffer_[i] = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY, + &formats[typeIdx_], width_, width_, + 0, NULL, &error_); + CHECK_RESULT(inBuffer_[i] == 0, "clCreateImage2D(inBuffer) failed"); + } + + outBufSize_ = sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * sizeof(cl_float4); + outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + outBufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + setKernel(); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + const char *buildOps = NULL; + error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int 
intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "sampleRate", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed"); + unsigned int sizeDW = width_; + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), + (void *)&sizeDW); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(sizeDW) failed"); + unsigned int writeIt = 0; + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), + (void *)&writeIt); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(writeIt) failed"); + for (unsigned int i = 0; i < numBufs_; i++) { + error_ = _wrapper->clSetKernelArg(kernel_, i + 3, sizeof(cl_mem), + (void *)&inBuffer_[i]); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed"); + // setData(inBuffer_[i], 0x3f800000); + } + // setData(outBuffer_, 0xdeadbeef); +} + +void OCLPerfImageSampleRate::run(void) { + int global = outBufSize_ / typeSizes[typeIdx_]; + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_); + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < maxIter; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + } + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // checkData(outBuffer_); + // Compute GB/s + double perf = + ((double)outBufSize_ 
* numBufs_ * (double)maxIter * (double)(1e-09)) / + sec; + char buf[256]; + SNPRINTF(buf, sizeof(buf), "Domain %dx%d, %13s, %2d images,%4dx%4d (GB/s)", + sizes[NUM_SIZES - 1], sizes[NUM_SIZES - 1], types[typeIdx_], + numBufs_, width_, width_); + + _perfInfo = (float)perf; + testDescString = buf; +} + +unsigned int OCLPerfImageSampleRate::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + for (unsigned int i = 0; i < numBufs_; i++) { + if (inBuffer_[i]) { + error_ = _wrapper->clReleaseMemObject(inBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + } + free(inBuffer_); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h new file mode 100644 index 0000000000..3705538e51 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageSampleRate.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_IMAGESAMPLERATE_H_
+#define _OCL_IMAGESAMPLERATE_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures image sampling throughput: a generated
+// kernel reads one texel from each of numBufs_ images per work-item, and the
+// result is reported in GB/s. Unlike the sibling image tests, this class
+// creates its own context, queue, program and kernel in open().
+class OCLPerfImageSampleRate : public OCLTestImp {
+ public:
+  OCLPerfImageSampleRate();
+  virtual ~OCLPerfImageSampleRate();
+
+ public:
+  // Creates all CL objects for sub-test `test` and binds kernel arguments.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Enqueues the kernel repeatedly and records the measured throughput.
+  virtual void run(void);
+  // Releases everything created by open(); returns _crcword.
+  virtual unsigned int close(void);
+
+  std::string shader_;  // kernel source generated by setKernel()
+  // Maps image `buffer` and fills it with the 32-bit value `data`.
+  void setData(cl_mem buffer, unsigned int data);
+  // Validation hook; its body is compiled out (#if 0) in the .cpp.
+  void checkData(cl_mem buffer);
+  // Regenerates shader_ for the current numBufs_.
+  void setKernel(void);
+
+  // NOTE(review): sibling tests use context_/program_/kernel_/error_ without
+  // declaring them, so these members may shadow OCLTestImp's -- confirm this
+  // is intended.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem* inBuffer_;  // malloc'd array of numBufs_ input images
+  cl_mem outBuffer_;  // write-only output buffer of outBufSize_ bytes
+  cl_int error_;
+
+  unsigned int width_;        // input image width and height, in pixels
+  unsigned int outBufWidth_;  // not referenced in OCLPerfImageSampleRate.cpp
+  unsigned int outBufSize_;   // size of outBuffer_, in bytes
+  // Base launch count; run() scales it by MAX_BUFS / numBufs_.
+  static const unsigned int MAX_ITERATIONS = 25;
+  unsigned int numBufs_;  // number of input images (a power of two)
+  unsigned int typeIdx_;  // index into the image format tables
+};
+
+#endif  // _OCL_IMAGESAMPLERATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp
new file mode 100644
index 0000000000..3886d3cfe9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.cpp
@@ -0,0 +1,317 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfImageWriteSpeed.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048}; + +#define NUM_FORMATS 1 +static const cl_image_format formats[NUM_FORMATS] = { + {CL_RGBA, CL_UNSIGNED_INT8}}; +static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"}; +static const unsigned int formatSize[NUM_FORMATS] = {4}; + +static const unsigned int Iterations[2] = {1, OCLPerfImageWriteSpeed::NUM_ITER}; + +OCLPerfImageWriteSpeed::OCLPerfImageWriteSpeed() { + _numSubTests = NUM_SIZES * NUM_FORMATS * 2; +} + +OCLPerfImageWriteSpeed::~OCLPerfImageWriteSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfImageWriteSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + memptr = NULL; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < 
numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + // if (num_devices > 0) + //{ + // platform = platforms[_platformIndex]; + // break; + //} +#if 0 + } +#endif + delete platforms; + } + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS; + numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)]; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], + bufSize_, bufSize_, 0, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed"); + memptr = new char[bufSize_ * bufSize_ * formatSize[bufnum_]]; +} + +void OCLPerfImageWriteSpeed::run(void) { + CPerfCounter timer; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_, bufSize_, 1}; + // Warm up + error_ = + _wrapper->clEnqueueWriteImage(cmd_queue_, outBuffer_, CL_TRUE, origin, + region, 0, 0, memptr, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadImage failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = + _wrapper->clEnqueueWriteImage(cmd_queue_, outBuffer_, CL_TRUE, origin, + region, 0, 0, memptr, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadImage failed"); + } + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Image write bandwidth in GB/s + double perf = ((double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter * + (double)(1e-09)) / + sec; + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s i: %4d (GB/s) 
", bufSize_, + bufSize_, textFormats[bufnum_], numIter); + testDescString = buf; +} + +unsigned int OCLPerfImageWriteSpeed::close(void) { + if (memptr) { + delete memptr; + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} + +OCLPerfPinnedImageWriteSpeed::OCLPerfPinnedImageWriteSpeed() { + _numSubTests = NUM_SIZES * NUM_FORMATS * 2; +} + +OCLPerfPinnedImageWriteSpeed::~OCLPerfPinnedImageWriteSpeed() {} + +void OCLPerfPinnedImageWriteSpeed::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint typeOfDevice = type_; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + memptr = NULL; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], typeOfDevice, + 0, NULL, 
&num_devices); + delete platforms; + } + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS; + numIter = Iterations[_openTest / (NUM_SIZES * NUM_FORMATS)]; + + CHECK_RESULT(platform == 0, "Couldn't find platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices, + devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + inBuffer_ = _wrapper->clCreateBuffer( + context_, flags, bufSize_ * bufSize_ * formatSize[bufnum_], NULL, + &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + flags = CL_MEM_WRITE_ONLY; + outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], + bufSize_, bufSize_, 0, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed"); + + memptr = (char *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_WRITE, 0, + bufSize_ * bufSize_ * formatSize[bufnum_], 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); +} + +unsigned int OCLPerfPinnedImageWriteSpeed::close(void) { + if (memptr) { + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, memptr, 0, + NULL, NULL); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clEnqueueUnmapMemObject(inBuffer_) failed"); + clFinish(cmd_queue_); + } + if (inBuffer_) { + 
error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h new file mode 100644 index 0000000000..20fec5124a --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfImageWriteSpeed.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_ImageWriteSpeed_H_ +#define _OCL_ImageWriteSpeed_H_ + +#include "OCLTestImp.h" + +class OCLPerfImageWriteSpeed : public OCLTestImp { + public: + OCLPerfImageWriteSpeed(); + virtual ~OCLPerfImageWriteSpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 100; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int bufSize_; + unsigned int bufnum_; + unsigned int numIter; + char* memptr; +}; + +class OCLPerfPinnedImageWriteSpeed : public OCLPerfImageWriteSpeed { + public: + OCLPerfPinnedImageWriteSpeed(); + virtual ~OCLPerfPinnedImageWriteSpeed(); + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual unsigned int close(void); + + cl_mem inBuffer_; +}; + +#endif // _OCL_ImageWriteSpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp new file mode 100644 index 0000000000..96310d5b52 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.cpp @@ -0,0 +1,239 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfKernelArguments.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +static const size_t BufSize = 0x1000; +static const size_t Iterations = 0x10000; +static const size_t TotalQueues = 4; +static const size_t NumBufCnts = 4; +static const size_t TotalArgs = 4; + +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +static const char* Arguments[TotalArgs] = { + "__global uint* out", + "__global uint* out, __global uint* buf0, __global uint* buf1, __global " + "uint* buf2, __global uint* buf3", + "__global uint* out, __global uint* buf0, __global uint* buf1, __global " + "uint* buf2, __global uint* buf3, \n" + "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global " + "uint* buf7, __global uint* buf8", + "__global uint* out, __global uint* buf0, __global uint* buf1, __global " + "uint* buf2, __global uint* buf3,\n" + "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global " + "uint* buf7, __global uint* buf8,\n" + "__global uint* buf9, __global uint* buf10, __global uint* buf11, __global " + "uint* buf12, __global uint* buf13,\n" + "__global uint* buf14, __global uint* buf15, __global uint* buf16, " + "__global uint* buf17, __global uint* buf18"}; + +static const char* strKernel = + "__kernel void dummy(%s) \n" + "{ \n" + " uint id = get_global_id(0); \n" + " uint value = 1; \n" + " out[id] = value; \n" + "} \n"; + +OCLPerfKernelArguments::OCLPerfKernelArguments() { + _numSubTests = TotalQueues * TotalArgs * NumBufCnts * 2; + failed_ = false; +} + +OCLPerfKernelArguments::~OCLPerfKernelArguments() {} + +void OCLPerfKernelArguments::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + cl_mem buffer; + _deviceId = deviceId; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + test_ = test; + cl_device_type deviceType; + error_ = 
_wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + perBatch_ = test >= (TotalQueues * TotalArgs * NumBufCnts); + + size_t numArguments = (test_ / TotalQueues) % TotalArgs; + char* program = new char[4096]; + SNPRINTF(program, sizeof(char) * 4096, strKernel, Arguments[numArguments]); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char**)&program, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + delete[] program; + + static const size_t NumBuffs[NumBufCnts] = {0x20, 0x100, 0x800, 0x2000}; + + size_t numMems = NumBuffs[(test_ / (TotalQueues * TotalArgs)) % NumBufCnts]; + size_t bufSize = BufSize * sizeof(cl_int4); + for (size_t b = 0; b < numMems; ++b) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLPerfKernelArguments::run(void) { + if (failed_) { + return; + } + unsigned int* values; + values = reinterpret_cast(new cl_int4[BufSize]); + CPerfCounter timer; + static 
const size_t Queues[] = {1, 2, 4, 8}; + size_t numQueues = Queues[test_ % TotalQueues]; + cl_uint numArguments; + _wrapper->clGetKernelInfo(kernel_, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), + &numArguments, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelInfo() failed"); + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + + size_t iter = Iterations / numQueues / buffers_.size(); + iter = (iter == 0) ? 1 : iter; + + std::vector cmdQueues(numQueues); + for (size_t q = 0; q < numQueues; ++q) { + cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue( + context_, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed"); + cmdQueues[q] = cmdQueue; + } + // Warm-up + for (size_t b = 0; b < (buffers_.size() / numArguments); ++b) { + for (size_t q = 0; q < numQueues; ++q) { + for (cl_uint a = 0; a < numArguments; ++a) { + cl_mem buffer = buffers()[(b * numArguments + a) % buffers_.size()]; + error_ = _wrapper->clSetKernelArg(kernel_, a, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + } + + size_t gws[1] = {256}; + size_t lws[1] = {256}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL, + gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + } + for (size_t q = 0; q < numQueues; ++q) { + _wrapper->clFinish(cmdQueues[q]); + } + + size_t disp = 0; + timer.Reset(); + timer.Start(); + + for (size_t i = 0; i < iter; ++i) { + for (size_t b = 0; b < buffers_.size(); ++b) { + for (size_t q = 0; q < numQueues; ++q) { + for (cl_uint a = 0; a < numArguments; ++a) { + cl_mem buffer = buffers()[(b * numArguments + a) % buffers_.size()]; + error_ = + _wrapper->clSetKernelArg(kernel_, a, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + } + + size_t gws[1] = {256}; + size_t lws[1] = {256}; + error_ = _wrapper->clEnqueueNDRangeKernel( 
+ cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + disp++; + if (perBatch_) { + _wrapper->clFlush(cmdQueues[q]); + } + } + if (perBatch_) { + for (size_t q = 0; q < numQueues; ++q) { + _wrapper->clFinish(cmdQueues[q]); + } + } + } + } + for (size_t q = 0; q < numQueues; ++q) { + _wrapper->clFinish(cmdQueues[q]); + } + timer.Stop(); + + for (size_t q = 0; q < numQueues; ++q) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + + std::stringstream stream; + if (perBatch_) + stream << "Time per batch (us) for " << numQueues << " queues, "; + else + stream << "Time per dispatch (us) for " << numQueues << " queues, "; + stream.flags(std::ios::right | std::ios::showbase); + stream.width(2); + stream << numArguments; + stream << " args, "; + stream.flags(std::ios::right | std::ios::showbase); + stream.width(4); + stream << buffers_.size() << " bufs"; + testDescString = stream.str(); + _perfInfo = static_cast(timer.GetElapsedTime() * 1000000 / disp); + delete[] values; +} + +unsigned int OCLPerfKernelArguments::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h new file mode 100644 index 0000000000..997ac22e59 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelArguments.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_PERF_KERNEL_ARGUMENTS_H_ +#define _OCL_PERF_KERNEL_ARGUMENTS_H_ + +#include "OCLTestImp.h" + +class OCLPerfKernelArguments : public OCLTestImp { + public: + OCLPerfKernelArguments(); + virtual ~OCLPerfKernelArguments(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int test_; + bool perBatch_; +}; + +#endif // _OCL_PERF_KERNEL_ARGUMENTS_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp new file mode 100644 index 0000000000..860d016bb7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.cpp @@ -0,0 +1,1008 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfKernelThroughput.h" + +#include +#include +#include +#include + +#include + +#include "CL/cl.h" +#include "Timer.h" + +#define DO_GPU_KERNELS 1 + +#if 0 +#define ENTER(X) printf("Entering %s\n", X); +#define EXIT(X) printf("Exiting %s\n", X); +#define PKT(X) X +#else +#define ENTER(X) +#define EXIT(X) +#define PKT(X) +#endif + +// work with multiples of 128 +#define ROUND_MULT(VAL, MULT) ((VAL / MULT) * MULT) +/* +int roundUp( int numToRound, int multiple) +{ + int r = numToRound % multiple; + if (r == 0) + { + return numToRound; + } else { + return numToRound + multiple - remainder; + } +} +*/ +// quiety warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define WORK_GROUP_SIZE 256 + +/******************************************************************************* + * Enumerated Types for Tests + ******************************************************************************/ + +// memory operations +const LARGE_INT numKernelTypes = 2; +static const char *kernelType[numKernelTypes] = {"MatMul", "Madds"}; + +// source/read memory locations +const LARGE_INT numMemPaths = 2; +static const char *memPath[numMemPaths] = {"Host", "Device"}; + +// buffer size +const LARGE_INT numNumElements = 12; // 15; +static const LARGE_INT numElements[numNumElements] = { + 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, + 16777216 //, + // 67108864, + // 268435456 +}; + +// flops/byte +const LARGE_INT numWorkSizes = 5; +static const LARGE_INT workSize[numWorkSizes] = {1, 4, 16, 64, 256}; + +const float initFloat = 0.001f; +const float zeroFloat = 0.0f; + +#define WORK_GROUP_SIZE 256 + +/******************************************************************************* + * Write the Matrix Multiply Shader Kernel + ******************************************************************************/ +void OCLPerfKernelThroughput::genShaderMatrixMultiply() { + ENTER("genShaderMatrixMultiply"); + + 
std::stringstream ss; + ss.clear(); +#if 0 + printf("%ix%i * %ix%i = %ix%i:\n", + matrixDim1_, matrixDim2_, + matrixDim2_, matrixDim1_, + matrixDim1_, matrixDim1_ + ); +#endif + ss << "#define BLOCK_SIZE 16\n" + "#define HA " + << matrixDim1_ + << "\n" + "#define WA " + << matrixDim2_ + << "\n" + "#define HB WA\n" + "#define WB HA\n" + "#define HC HA\n" + "#define WC WB\n" + "__kernel void\n" + "__attribute__((reqd_work_group_size(16,16,1)))\n" + "kernel1(\n" + " __global float * restrict C,\n" + " __global float * restrict A,\n" + " __global float * restrict B )\n" + "{\n" + " int bx = get_group_id(0);\n" + " int by = get_group_id(1);\n" + " int tx = get_local_id(0);\n" + " int ty = get_local_id(1);\n" + " int aBegin = WA * BLOCK_SIZE * by;\n" + " int aEnd = aBegin + WA - 1;\n" + " int aStep = BLOCK_SIZE;\n" + " int bBegin = BLOCK_SIZE * bx;\n" + " int bStep = BLOCK_SIZE * WB;\n" + " __private float c = 0.f;\n" + " __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n" + " __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n" + " for (\n" + " int a = aBegin, b = bBegin;\n" + " a <= aEnd;\n" + " a += aStep, b += bStep)\n" + " {\n" + " localA[ty][tx] = (get_global_id(0) < WA && get_global_id(1) < " + "HA) ? A[a + WA * ty + tx] : 0;\n" + " localB[ty][tx] = (get_global_id(0) < WB && get_global_id(1) < " + "HB) ? 
B[b + WB * ty + tx] : 0;\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " for (int k = 0; k < BLOCK_SIZE; ++k)\n" + " c += localA[ty][k] * localB[k][tx];\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " }\n" + " int cIdx = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx + WB * ty + tx;\n" + " if (get_global_id(0) < WC && get_global_id(1) < WC)\n" + " {\n" + " C[cIdx] = c;\n" + " }\n" + "}\n"; + + shader_ = ss.str(); + gold_ = 0.f; + for (int i = 0; i < matrixDim2_; i++) gold_ += initFloat * initFloat; + // gold_ = initFloat * initFloat * matrixDim2_; + // printf("shader:\n%s\n", shader_.c_str()); + // printf("gold_: %f\n", gold_); + EXIT("genShaderMatrixMultiply"); +} + +/******************************************************************************* + * Write the Madds Shader Kernel + ******************************************************************************/ +void OCLPerfKernelThroughput::genShaderMadds() { + ENTER("genShaderMadds"); + + int flopLoopIter = 2 * (flopsPerByte_ * 4 * 4) / 16; // bytes, flops + + std::stringstream ss; + ss.clear(); + float a, b; + + ss << // begin kernel + "__kernel void\n" + "__attribute__((reqd_work_group_size(" + << 256 + << ",1,1)))\n" + "kernel1(\n" + " __global float4 * restrict input,\n" + " __global float4 * restrict output )\n" + "{\n"; + + // begin loop + ss << " for ( uint idx = get_global_id(0);\n" + " idx < " + << numElements[numElementsIdx_] + << ";\n" + " idx += get_global_size(0) )\n" + " {\n"; + + // do load + ss << " float4 prefetch = input[ idx ];\n" + " float a0 = prefetch.x;\n" + " float a1 = prefetch.y;\n" + " float a2 = prefetch.z;\n" + " float a3 = prefetch.w;\n" + " float b0 = a0;\n" + " float b1 = a1;\n" + " float b2 = a2;\n" + " float b3 = a3;\n"; + a = initFloat; + b = a; + + // do math + for (int i = 0; i < flopLoopIter; i++) { + ss << " a0 += b3*b1;\n" + " a1 += b0*b2;\n" + " a2 += b1*b3;\n" + " a3 += b2*b0;\n" + " b0 += a3*a1;\n" + " b1 += a0*a2;\n" + " b2 += a1*a3;\n" + " b3 += a2*a0;\n"; + // printf("a += b*b; %f += 
%f*%f\n", a, b, b); + a += b * b; + // printf("b += a*a; %f += %f*%f\n", b, a, a); + b += a * a; + } + + // do write or accumulate + ss << " __private float4 tmp;\n" + " tmp.x = b0;\n" + " tmp.y = b1;\n" + " tmp.z = b2;\n" + " tmp.w = b3;\n" + " output[ idx ] = tmp;\n"; + gold_ = b; + // printf("GPU gold_ Tmp: %f\n", gold_); + + // end loop + ss << " } // end loop\n"; + // end kernel + ss << " } // end kernel\n\n"; + + shader_ = ss.str(); + // printf("shader:\n%s\n", shader_.c_str()); + // printf("gold_: %f\n", gold_); + EXIT("genShaderMadds"); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +/******************************************************************************* + * Constructor + ******************************************************************************/ +OCLPerfKernelThroughput::OCLPerfKernelThroughput() { + ENTER("constructor"); + _numSubTests = numKernelTypes * numMemPaths * numNumElements * numWorkSizes; + + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + context_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present + // instead of 
just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + // printf("NumDevices: %i\n", num_devices); + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it, else die. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + // get gpu speed + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, + sizeof(maxClockFrequency_), + &maxClockFrequency_, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(maxComputeUnits_), + &maxComputeUnits_, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (maxComputeUnits_ > 8) { + // printf("%i CUs reported; assuming 8 instead.", maxComputeUnits_); + maxComputeUnits_ = 8; + } + // printf("Compute Units: %i\n", maxComputeUnits_); + + // printf("Subtests: %i\n", _numSubTests); + + // create context + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + cl_uint 
tmp; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(tmp), &tmp, NULL); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + // printf("NumComputeUnits: %u\n", tmp); + maxComputeUnits_ = static_cast(tmp); + // printf("NumComputeUnits: %lld\n", maxComputeUnits_); + EXIT("constructor"); +} + +OCLPerfKernelThroughput::~OCLPerfKernelThroughput() {} + +/******************************************************************************* + * Open - initializes test, compile GPU kernel + ******************************************************************************/ +void OCLPerfKernelThroughput::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + ENTER("open"); + /*********************************************************** + * select subtest + **********************************************************/ + int testIdx = + test + numKernelTypes * numMemPaths * numNumElements * numWorkSizes; + memPathIdx_ = testIdx % numMemPaths; + testIdx /= numMemPaths; + numElementsIdx_ = testIdx % numNumElements; + testIdx /= numNumElements; + workSizeIdx_ = testIdx % numWorkSizes; + testIdx /= numWorkSizes; + kernelTypeIdx_ = testIdx % numKernelTypes; + testIdx /= numKernelTypes; + + // float md1; + + // kernel values + switch (kernelTypeIdx_) { + case 0: // Matrix Multiply + // md1 = sqrt(1.f*numElements[numElementsIdx_]); + // printf("MD1: sqrt(%f) = %f\n", 1.f*numElements[numElementsIdx_],md1); + matrixDim1_ = static_cast(sqrt(1.f * numElements[numElementsIdx_])); + matrixDim2_ = matrixDim1_ * (int)workSize[workSizeIdx_]; + genShaderMatrixMultiply(); + work_dim_ = 2; + global_work_size_ = new size_t[work_dim_]; + global_work_size_[0] = ((matrixDim1_ - 1) / 16 + 1) * + 16; // matrixDim1_ < 16 ? 
16 : matrixDim1_; + global_work_size_[1] = global_work_size_[0]; + local_work_size_ = new size_t[work_dim_]; + local_work_size_[0] = 16; + local_work_size_[1] = local_work_size_[0]; + /* + printf("Global: %ix%i; Local: %ix%i; Matrix: %ix%i\n", + global_work_size_[0], + global_work_size_[1], + local_work_size_[0], + local_work_size_[1], + matrixDim1_, + matrixDim2_ + ); + */ + input1BufferSize_ = + static_cast(matrixDim1_ * matrixDim2_ * sizeof(float)); + input2BufferSize_ = + static_cast(matrixDim2_ * matrixDim1_ * sizeof(float)); + output1BufferSize_ = + static_cast(matrixDim1_ * matrixDim1_ * sizeof(float)); + _reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) + + (1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) + + (1.0 * matrixDim1_ * matrixDim1_ * sizeof(float)); + break; + case 1: // Flops/Byte + flopsPerByte_ = (int)workSize[workSizeIdx_]; // for kernelType == 0 + genShaderMadds(); + numWorkGroupsPerComputeUnit_ = 32; // TODO + numThreads_ = + numWorkGroupsPerComputeUnit_ * maxComputeUnits_ * WORK_GROUP_SIZE; + work_dim_ = 1; + global_work_size_ = new size_t[work_dim_]; + local_work_size_ = new size_t[work_dim_]; + global_work_size_[0] = numThreads_; + local_work_size_[0] = WORK_GROUP_SIZE; + input1BufferSize_ = + static_cast(numElements[numElementsIdx_] * sizeof(float4)); + input2BufferSize_ = 0; + output1BufferSize_ = + static_cast(numElements[numElementsIdx_] * sizeof(float4)); + _reqDataSize = 2.0 * numElements[numElementsIdx_] * sizeof(float4); + break; + } + + PKT(printf("Test Parameters:\n" + "\tkernelTypeIdx: %i\n" + "\tmemPathIdx: %i\n" + "\tnumElementsIdx: %i\n" + "\tworkSizeIdx: %i\n" + "\n\n", + kernelTypeIdx_, memPathIdx_, numElementsIdx_, workSizeIdx_);) + + /*********************************************************** + * get context and queue + **********************************************************/ + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + 
cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0; + _deviceId = deviceId; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + input1Buffer_ = 0; + output1Buffer_ = 0; + _errorFlag = false; // Reset error code so a single error + // doesn't prevent other subtests from running + _errorMsg = ""; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present + // instead of just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + delete platforms; + } + + /* + * If we could find our platform, use it, else die. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* + * Get the requested device + */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, + CL_QUEUE_PROFILING_ENABLE, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + // Global memory size + cl_ulong _maxMemoryAllocationSize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(cl_ulong), + &_maxMemoryAllocationSize, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, + "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_SIZE) failed"); +#if 0 + printf("Buffer Sizes: %i %i %i = %f\n", + input1BufferSize_, + input2BufferSize_, + output1BufferSize_, + _reqDataSize); +#endif + _dataSizeTooBig = (_reqDataSize > _maxMemoryAllocationSize); + if (_dataSizeTooBig) { + // printf("DATA TOO LARGE FOR DEVICE !!!"); + return; + } + + // create kernel + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "kernel1", 
&error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + /*********************************************************** + * Allocate GPU Memory + **********************************************************/ + cl_mem_flags inputBufferFlags = 0; + cl_mem_flags outputBufferFlags = 0; + + // choose gpu source buffer type + switch (memPathIdx_) { + case 0: // host memory + // printf("Allocating Host Memories\n"); + // allocate "device" memory + inputBufferFlags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + outputBufferFlags = CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + input1Buffer_ = _wrapper->clCreateBuffer( + context_, inputBufferFlags, input1BufferSize_, NULL, &error_); + CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed"); + if (input1Buffer_ == 0) printf("Error: %i\n", error_); + if (input2BufferSize_) { + input2Buffer_ = _wrapper->clCreateBuffer( + context_, inputBufferFlags, input2BufferSize_, NULL, &error_); + CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed"); + } + output1Buffer_ = _wrapper->clCreateBuffer( + context_, outputBufferFlags, output1BufferSize_, NULL, &error_); + CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed"); + if (output1Buffer_ == 0) printf("Error: %i\n", error_); + + // map host memory + input1Ptr_ = (float *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, input1Buffer_, true, CL_MAP_WRITE, 0, input1BufferSize_, + 0, NULL, NULL, &error_); + if (input2BufferSize_) { + input2Ptr_ = (float *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, input2Buffer_, true, CL_MAP_WRITE, 0, input2BufferSize_, + 0, NULL, NULL, &error_); + } + output1Ptr_ = (float *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, output1Buffer_, true, CL_MAP_READ, 0, output1BufferSize_, + 0, NULL, NULL, &error_); + _wrapper->clFinish(cmd_queue_); + break; + + case 1: // device memory + // printf("Allocating Device Memories\n"); + // allocate device memory + inputBufferFlags = CL_MEM_READ_WRITE; + outputBufferFlags = CL_MEM_READ_WRITE; + 
input1Buffer_ = _wrapper->clCreateBuffer( + context_, inputBufferFlags, input1BufferSize_, NULL, &error_); + CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed"); + if (input2BufferSize_) { + input2Buffer_ = _wrapper->clCreateBuffer( + context_, inputBufferFlags, input2BufferSize_, NULL, &error_); + CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed"); + } + output1Buffer_ = _wrapper->clCreateBuffer( + context_, outputBufferFlags, output1BufferSize_, NULL, &error_); + CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed"); + // printf("\tDone Allocating Device Memory\n"); + + // allocate host memory + input1Ptr_ = new float[input1BufferSize_ / sizeof(float)]; + if (input2BufferSize_) { + input2Ptr_ = new float[input2BufferSize_ / sizeof(float)]; + } + output1Ptr_ = new float[output1BufferSize_ / sizeof(float)]; + // printf("\tDone Allocating Host Memory\n"); + + break; + default: + CHECK_RESULT(1, "Invalid Memory Path Idx"); + // invalid + } + for (unsigned int i = 0; i < input1BufferSize_ / sizeof(float); i++) { + input1Ptr_[i] = initFloat; + } + for (unsigned int i = 0; i < input2BufferSize_ / sizeof(float); i++) { + input2Ptr_[i] = initFloat; + } + for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) { + output1Ptr_[i] = zeroFloat; + } + +#if 0 + printf("Allocating GPU: %.0fMB, %.0fMB\n", + static_cast(1.f*input1BufferSize_/1024.f/1024.f), + static_cast(1.f*output1BufferSize_/1024.f/1024.f)); + input1Buffer_ = _wrapper->clCreateBuffer( + context_, inputBufferFlags, input1BufferSize_, NULL, &error_); + CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed"); + output1Buffer_ = _wrapper->clCreateBuffer( + context_, outputBufferFlags, output1BufferSize_, NULL, &error_); + CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Output failed"); + error_ = /*_wrapper->*/clEnqueueFillBuffer( + cmd_queue_, input1Buffer_, &initFloat, sizeof(initFloat), + 0, input1BufferSize_, 0, NULL, NULL); + CHECK_RESULT(error_ != 
CL_SUCCESS, "clEnqueueFillBuffer failed"); + error_ = /*_wrapper->*/clEnqueueFillBuffer( + cmd_queue_, output1Buffer_, &zeroFloat, sizeof(zeroFloat), + 0, output1BufferSize_, 0, NULL, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed"); + + /*********************************************************** + * Set Kernel Args + **********************************************************/ + error_ = _wrapper->clSetKernelArg( + kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + error_ = _wrapper->clSetKernelArg( + kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); +#endif + + EXIT("open"); +} + +/******************************************************************************* + * Run - execute full test once and return performance + ******************************************************************************/ +void OCLPerfKernelThroughput::run(void) { + ENTER("run"); + CPerfCounter timer; + if (!_dataSizeTooBig) { + // set kernel args +#if 1 + switch (kernelTypeIdx_) { + case 0: // Matrix Multiply + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(output1Buffer_), + (void *)&output1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(input1Buffer_), + (void *)&input1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(input2Buffer_), + (void *)&input2Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + break; + case 1: // Flops/Byte + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(input1Buffer_), + (void *)&input1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(output1Buffer_), + (void *)&output1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, 
"clSetKernelArg failed"); + break; + } +#endif + launchKernel(); + timer.Reset(); + timer.Start(); + for (int i = 0; i < MAX_LOOP_ITER; i++) { + launchKernel(); + } + timer.Stop(); + } // data not too large + double totalSec = _dataSizeTooBig ? 1 : timer.GetElapsedTime(); + // printf("Total Time: %f seconds\n", totalSec); + // printf("Average Kernel Time: %f seconds\n", totalSec / MAX_LOOP_ITER); + + // analyze performance + avgKernelTime_ = (float)(totalSec / MAX_LOOP_ITER * 1000000); // microseconds + double flopCount; + switch (kernelTypeIdx_) { + case 0: // Matrix Multiply + flopCount = (2.0 * matrixDim1_ * matrixDim1_ * matrixDim2_); + // printf("FlopCount = 2*%i*%i*%i=%f\n", + // matrixDim1_,matrixDim1_,matrixDim2_,flopCount); + bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * + 1000000.f / avgKernelTime_; // GB/s + gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0); + break; + case 1: // Madds + flopCount = _reqDataSize * flopsPerByte_; + bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * + 1000000.f / avgKernelTime_; // GB/s + gflops_ = bandwidth_ * flopsPerByte_; + break; + } + if (_dataSizeTooBig) { + printf("REQUESTED DATA SIZE EXCEEDS GLOBAL MEMORY !!!\n"); + bandwidth_ = 0; + gflops_ = 0; + avgKernelTime_ = 0; + } + // here print out details + char buf[512]; + int bytesWritten; + bytesWritten = SNPRINTF( + buf, sizeof(buf), + "Kernel:%7s; " + "Work:%4i; " + "Buff:%11.0f; " + "Path:%7s; " + "%10.5e GB/s; " + "%10.5e GFlop/s; ", + kernelType[kernelTypeIdx_], static_cast(workSize[workSizeIdx_]), + _reqDataSize, memPath[memPathIdx_], bandwidth_, gflops_); + testDescString = buf; + _perfInfo = avgKernelTime_; + if (!_dataSizeTooBig) checkData(); + EXIT("run"); +} + +void OCLPerfKernelThroughput::launchKernel(void) { + ENTER("launchKernel") + /*********************************************************** + * Copy Data To + **********************************************************/ + // 
printf("Copying Data To Device\n"); + switch (memPathIdx_) { + case 0: // zero copy + // do nothing + // void *inputPtr = _wrapper->clEnqueueMapBuffer( + // cmd_queue_, input1Buffer_, true, CL_MAP_READ, + // 0, input1BufferSize_, 0, NULL, NULL, &error_); + // void *outputPtr = _wrapper->clEnqueueMapBuffer( + // cmd_queue_, output1Buffer_, true, CL_MAP_READ, + // 0, output1BufferSize_, 0, NULL, NULL, &error_); + //_wrapper->clFinish(cmd_queue_); + break; + case 1: // explicit copy to device memory + // printf("Queue: %p\n", &cmd_queue_); + // printf("devBuffer: %i\n", input1Buffer_); + // printf("hstBuffer: %p\n", input1Ptr_); + // printf("bufSize: %i\n", input1BufferSize_); + error_ = _wrapper->clEnqueueWriteBuffer( + cmd_queue_, input1Buffer_, true, 0, input1BufferSize_, + (const void *)input1Ptr_, 0, NULL, NULL); + if (input2BufferSize_) { + error_ = _wrapper->clEnqueueWriteBuffer( + cmd_queue_, input2Buffer_, true, 0, input2BufferSize_, + (const void *)input2Ptr_, 0, NULL, NULL); + } + // printf("Error: %i\n", error_); + std::fflush(stdout); + _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); + //_error = _wrapper->clEnqueueWriteBuffer( + // cmd_queue_, output1Buffer_, true, 0, output1BufferSize_, + // (const void *)output1Ptr_, 0, NULL, NULL ); + // CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); + break; + } + + /*********************************************************** + * Set Kernel Args + **********************************************************/ +#if 0 + error_ = _wrapper->clSetKernelArg( + kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + error_ = _wrapper->clSetKernelArg( + kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); +#endif + + // printf("Launching Kernel: %ix%i threads\n", global_work_size_[0], + // local_work_size_[0]); + + 
/*********************************************************** + * Launch Kernel + **********************************************************/ + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, work_dim_, NULL, (const size_t *)global_work_size_, + (const size_t *)local_work_size_, 0, NULL, NULL); + // printf("Error: %i\n", error_); + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + /*********************************************************** + * Copy Data From + **********************************************************/ + // printf("Copying Data From Device\n"); + switch (memPathIdx_) { + case 0: // zero copy + // do nothing + // void *inputPtr = _wrapper->clEnqueueMapBuffer( + // cmd_queue_, input1Buffer_, true, CL_MAP_READ, + // 0, input1BufferSize_, 0, NULL, NULL, &error_); + // void *outputPtr = _wrapper->clEnqueueMapBuffer( + // cmd_queue_, output1Buffer_, true, CL_MAP_READ, + // 0, output1BufferSize_, 0, NULL, NULL, &error_); + //_wrapper->clFinish(cmd_queue_); + break; + case 1: // explicit copy to device memory + //_error = _wrapper->clEnqueueReadBuffer( + // cmd_queue_, input1Buffer_, true, 0, input1BufferSize_, + // (void *)input1Ptr_, 0, NULL, NULL ); + // CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); + // printf("VAL0 %p + error_ = _wrapper->clEnqueueReadBuffer( + cmd_queue_, output1Buffer_, true, 0, output1BufferSize_, + (void *)output1Ptr_, 0, NULL, NULL); + // printf("Error: %i\n", error_); + CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); + break; + } + + EXIT("launchKernel") +} + +/******************************************************************************* + * Check Data + ******************************************************************************/ +void OCLPerfKernelThroughput::checkData() { + _wrapper->clFinish(cmd_queue_); + float errorThreshhold = 0.00001f; + float eqMax = gold_ + errorThreshhold * gold_; + float eqMin = gold_ - errorThreshhold * gold_; 
+ /* + printf("%ix%i * %ix%i = %ix%i:\n", + matrixDim1_, matrixDim2_, + matrixDim2_, matrixDim1_, + matrixDim1_, matrixDim1_ + ); + */ + for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) { + float value = output1Ptr_[i]; + bool equal = (value > eqMin && value < eqMax); + if (!equal) { +#if 0 + printf("Output[%i] = %.6e; gold_ = %.6e; %s\n", + i, + value, + gold_, + equal ? "Equal" : "NOT Equal"); +#endif + // printf("FAILURE\n"); + // CHECK_RESULT_NO_RETURN(1, "Data validation failed!\n"); + _errorFlag = true; + break; + } else { + // printf("M[%i] = %.6e\n", i, output1Ptr_[i]); + } + } +} + +/******************************************************************************* + * Close - delete all data and release opencl objects + * + * Host pointers are cleared once unmapped or deleted so that a later close() + * on the other memory path cannot unmap or delete[] a stale pointer. + ******************************************************************************/ +unsigned int OCLPerfKernelThroughput::close(void) { + ENTER("close"); + _wrapper->clFinish(cmd_queue_); + + if (global_work_size_) { + delete[] global_work_size_; + global_work_size_ = NULL; + } + if (local_work_size_) { + delete[] local_work_size_; + local_work_size_ = NULL; + } + // switch for memory type + switch (memPathIdx_) { + case 0: // zero copy + // unmap ptr + if (input1Ptr_) { + error_ = /*_wrapper->*/ clEnqueueUnmapMemObject( + cmd_queue_, input1Buffer_, input1Ptr_, 0, NULL, NULL); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clEnqueueUnmapMemObject(input_) failed"); + _wrapper->clFinish(cmd_queue_); + error_ = _wrapper->clReleaseMemObject(input1Buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(input1Buffer_) failed"); + input1Buffer_ = 0; + input1Ptr_ = 0; + } + if (input2Ptr_) { + error_ = /*_wrapper->*/ clEnqueueUnmapMemObject( + cmd_queue_, input2Buffer_, input2Ptr_, 0, NULL, NULL); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clEnqueueUnmapMemObject(input_) failed"); + _wrapper->clFinish(cmd_queue_); + error_ = _wrapper->clReleaseMemObject(input2Buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + 
"clReleaseMemObject(input2Buffer_) failed"); + input2Buffer_ = 0; + input2Ptr_ = 0; + } + if (output1Ptr_) { + error_ = /*_wrapper->*/ clEnqueueUnmapMemObject( + cmd_queue_, output1Buffer_, output1Ptr_, 0, NULL, NULL); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clEnqueueUnmapMemObject(output_) failed"); + _wrapper->clFinish(cmd_queue_); + error_ = _wrapper->clReleaseMemObject(output1Buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(output1Buffer_) failed"); + output1Buffer_ = 0; + output1Ptr_ = 0; + } + break; + case 1: // explicit copy to device memory + // release object + if (input1Buffer_) { + error_ = _wrapper->clReleaseMemObject(input1Buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(input1Buffer_) failed"); + input1Buffer_ = 0; + } + if (input2Buffer_) { + error_ = _wrapper->clReleaseMemObject(input2Buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(input2Buffer_) failed"); + input2Buffer_ = 0; + } + if (output1Buffer_) { + error_ = _wrapper->clReleaseMemObject(output1Buffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(output1Buffer_) failed"); + output1Buffer_ = 0; + } + if (input1Ptr_) { + delete[] input1Ptr_; + input1Ptr_ = 0; + } + if (input2Ptr_) { + delete[] input2Ptr_; + input2Ptr_ = 0; + } + if (output1Ptr_) { + delete[] output1Ptr_; + output1Ptr_ = 0; + } + break; + } + + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + kernel_ = 0; + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + program_ = 0; + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + cmd_queue_ = 0; + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != 
CL_SUCCESS, "clReleaseContext failed"); + context_ = 0; + } + _wrapper->clFinish(cmd_queue_); + + EXIT("close"); + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h new file mode 100644 index 0000000000..84777a1cdd --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfKernelThroughput.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +/******************************************************************************* + * Kernel Throughput + * + * Declares OCLPerfKernelThroughput, an OCLTestImp subtest that builds a + * generated kernel (matrix multiply or flops-per-byte madds), times + * MAX_LOOP_ITER launches of it and reports bandwidth (GB/s) and GFlop/s. + * Buffers are either mapped host allocations or device memory with + * explicit write/read copies, selected by memPathIdx_. + ******************************************************************************/ + +#ifndef _OCL_KernelThroughput_H_ +#define _OCL_KernelThroughput_H_ + +#ifdef WIN32 +#include "xmmintrin.h" +#endif + +#include "OCLTestImp.h" +//#include +//#define WIN32_LEAN_AND_MEAN //Restricts windows.h to include only the core +//API. #include "windows.h" #undef Yield #include #include +// #include #include + +#define LARGE_INT long long +#define UNSIGNED_LARGE_INT unsigned long long +#define MAX_LOOP_ITER 10 +typedef cl_float4 float4; +typedef void (*CPUKernel)(__m128 *, __m128 *, unsigned int); + +class OCLPerfKernelThroughput : public OCLTestImp { + public: + OCLPerfKernelThroughput(); + virtual ~OCLPerfKernelThroughput(); + + public: + virtual void open(unsigned int test, char *units, double &conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void genShaderMadds(); + void genShaderMatrixMultiply(); + void checkData(); + // void allocateBuffers(); + void launchKernel(); + + // test parameters (the *Idx_ values are decoded from the subtest id in open()) + int kernelTypeIdx_; + int memPathIdx_; + int numElementsIdx_; + int workSizeIdx_; + float gold_; // value checkData() expects in every output element + double _reqDataSize; // sum of the input/output buffer sizes, in bytes + bool _dataSizeTooBig; // _reqDataSize > CL_DEVICE_MAX_MEM_ALLOC_SIZE; run() then skips the kernel + + // device attributes + cl_uint maxComputeUnits_; + cl_uint maxClockFrequency_; + + LARGE_INT numComputeUnits_; + LARGE_INT numWorkGroupsPerComputeUnit_; + LARGE_INT numThreads_; + cl_uint work_dim_; + size_t *global_work_size_; + size_t *local_work_size_; + + // opencl objects + cl_context context_; + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_int error_; + + // buffer sizes + + // kernel-specific values + int flopsPerByte_; + int matrixDim1_, matrixDim2_; + + // buffers (the *BufferSize_ values are byte counts) + size_t input1BufferSize_; + size_t input2BufferSize_; + size_t output1BufferSize_; + cl_mem 
input2Buffer_; + cl_mem output1Buffer_; + float *input1Ptr_; + float *input2Ptr_; + float *output1Ptr_; + + // performance results + float bandwidth_; // GB/s + float gflops_; // GFlop/s + float avgKernelTime_; // microseconds +}; + +#endif // _OCL_KernelThroughput_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp new file mode 100644 index 0000000000..495f8c1a32 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.cpp @@ -0,0 +1,432 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfLDSLatency.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const unsigned int NUM_SIZES = 5; +// Buffer sizes in bytes: 2 KB up to 32 KB +static const unsigned int Sizes[NUM_SIZES] = {2048, 4096, 8192, 16384, 32768}; + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfLDSLatency::genShader() { + shader_.clear(); + + // DO NOT PUBLISH + // Adopted from SiSoft Sandra 2013's memory latency test + shader_ += + "__kernel\n" + //"__attribute__((work_group_size_hint(1, 1, 1)))\n" + "void MemWalker(\n" + " global uint * restrict input,\n" + " global uint * restrict output,\n" + " const uint uCount, const uint uSize,\n" + " const uint uOffset, const int bMem, const uint repeats)\n" + "{\n" + " uint o = uOffset;\n" + " uint lid = get_local_id(0);\n" + " uint x = lid*o;\n" + " local uint lclData[8192];\n" + "\n" + " {\n" + " uint i = uCount;\n" + " while (i--) {\n" + " uint oldX = x;\n" + " x = input[x];\n" + " lclData[oldX] = x;\n" + " }\n" + " }\n" + "\n" + " x = lid*uOffset;\n" + " for (uint loop = 0; loop < repeats; loop++) {\n" + " uint i = uCount;\n" + " while (i--) {\n" + " x = lclData[x] + o;\n" + " }\n" + " }\n" + "\n" + " output[0] = x;\n" + "}\n"; + + // printf("shader:\n%s\n", shader_.c_str()); + shader_ += "\n\n"; + shader_ += + "__kernel\n" + //"__attribute__((work_group_size_hint(1, 1, 1)))\n" + "void Overhead(\n" + " global uint * restrict input,\n" + " global uint * restrict output,\n" + " const uint uCount, const uint uSize,\n" + " const uint uOffset, const int bMem, const uint repeats)\n" + "{\n" + " local uint lclData[8192];\n" + "#ifdef USE_FLOAT\n" + " {\n" + " uint x = 0;\n" + " uint i = uCount;\n" + " while (i--) {\n" + " uint oldX = x;\n" + " x = input[x] /* + o*/;\n" + " lclData[oldX] = x;\n" + " }\n" + " }\n" + " float x = (float)input[0];\n" + " for (uint loop = 0; loop < repeats; loop++) {\n" + " uint i = uCount;\n" + " x = 
(float)uOffset*x;\n" + " while (i--) {\n" + " x += (float)i;\n" + " }\n" + " }\n" + " output[0] = (uint)x + uOffset*lclData[8191];\n" + "#else\n" + " {\n" + " uint x = 0;\n" + " uint i = uCount;\n" + " while (i--) {\n" + " uint oldX = x;\n" + " x = input[x] /* + o*/;\n" + " lclData[oldX] = x;\n" + " }\n" + " }\n" + " uint x = input[0];\n" + " for (uint loop = 0; loop < repeats; loop++) {\n" + " uint i = uCount;\n" + " x = x*uOffset;\n" + " while (i--) {\n" + " x += i;\n" + " }\n" + " }\n" + " output[0] = x + uOffset*lclData[8191];\n" + "#endif\n" + "}\n"; +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfLDSLatency::OCLPerfLDSLatency() { + _numSubTests = NUM_SIZES * 2; + maxSize_ = Sizes[NUM_SIZES - 1] * 2048; +} + +OCLPerfLDSLatency::~OCLPerfLDSLatency() {} + +void OCLPerfLDSLatency::setData(cl_mem buffer, unsigned int val) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_WRITE, 0, + width_, 0, NULL, NULL, &error_); + unsigned int *data = (unsigned int *)ptr; + for (unsigned int i = 0; i < bufSizeDW_; i++) { + data[(i * (1024 + 17)) % bufSizeDW_] = ((i + 1) * (1024 + 17)) % bufSizeDW_; + } + error_ = + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); + clFinish(cmd_queue_); +} + +void OCLPerfLDSLatency::checkData(cl_mem buffer) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0, + sizeof(cl_uint), 0, NULL, NULL, &error_); + + unsigned int *data = (unsigned int *)ptr; + if (data[0] != 0) { + printf("OutData= 0x%08x\n", data[0]); + CHECK_RESULT_NO_RETURN(data[0] != 0, "Data validation failed!\n"); + } + error_ = + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); +} + +void OCLPerfLDSLatency::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + 
cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + moreThreads = false; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + _errorFlag = false; // Reset error code so a single error doesn't prevent + // other subtests from running + _errorMsg = ""; + isAMD_ = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD_ = true; + } + } + + delete platforms; + } + + width_ = Sizes[test % NUM_SIZES]; + + bufSizeDW_ = width_ / sizeof(cl_uint); + moreThreads = ((test / NUM_SIZES) % 2) ? 
true : false; + + CHECK_RESULT(platform == 0, "Couldn't find OpenCL platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "Failed to allocate devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + free(devices); + devices = NULL; + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_uint flags; + flags = 0; + inBuffer_ = _wrapper->clCreateBuffer(context_, flags, width_, NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + outBuffer_ = + _wrapper->clCreateBuffer(context_, 0, 1 * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + genShader(); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + if (isAMD_) args += " -D USE_FLOAT"; + + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "MemWalker", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel(MemWalker) failed"); + + kernel2_ = _wrapper->clCreateKernel(program_, "Overhead", &error_); + 
CHECK_RESULT(kernel2_ == 0, "clCreateKernel(Overhead) failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), + (void *)&bufSizeDW_); + error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), + (void *)&bufSizeDW_); + unsigned int zero = 0; + error_ = _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&zero); + int bMem = 1; + error_ = _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_int), (void *)&bMem); + // Limit the repeats, large buffers will have more samples, but the test runs + // for a long time + repeats_ = std::max((maxSize_ >> 4) / bufSizeDW_, 1u); + error_ = + _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&repeats_); + + error_ = + _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = _wrapper->clSetKernelArg(kernel2_, 1, sizeof(cl_mem), + (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel2_, 2, sizeof(cl_uint), + (void *)&bufSizeDW_); + error_ = _wrapper->clSetKernelArg(kernel2_, 3, sizeof(cl_uint), + (void *)&bufSizeDW_); + error_ = + _wrapper->clSetKernelArg(kernel2_, 4, sizeof(cl_uint), (void *)&zero); + error_ = _wrapper->clSetKernelArg(kernel2_, 5, sizeof(cl_int), (void *)&bMem); + error_ = + _wrapper->clSetKernelArg(kernel2_, 6, sizeof(cl_uint), (void *)&repeats_); + + setData(inBuffer_, (int)1.0f); +} + +void OCLPerfLDSLatency::run(void) { + int global = 1; + int local = 1; + + if (moreThreads) { + if (isAMD_) { + global *= 64; + local *= 64; + } else { + global *= 32; + local *= 32; + } + } + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + // Warm-up + unsigned int warmup = 128; + error_ = + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), (void *)&warmup); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, 
kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), + (void *)&bufSizeDW_); + _wrapper->clFinish(cmd_queue_); + + // Restore input buffer when finished as it may have been modified by RW test + setData(inBuffer_, (int)1.0f); + + CPerfCounter timer, timer2; + + timer.Reset(); + timer.Start(); + + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + + checkData(outBuffer_); + + timer2.Reset(); + timer2.Start(); + + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel2_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + _wrapper->clFinish(cmd_queue_); + + timer2.Stop(); + double sec = timer.GetElapsedTime() - timer2.GetElapsedTime(); + + // Read latency in ns + double perf = sec * (double)(1e09) / ((double)bufSizeDW_ * (double)repeats_); + + _perfInfo = (float)perf; + char buf[256]; + char buf2[32]; + buf2[0] = '\0'; + SNPRINTF(buf, sizeof(buf), "%10s %2d threads, %8d reads, %5d repeats (ns)", + buf2, global, bufSizeDW_, repeats_); + testDescString = buf; +} + +unsigned int OCLPerfLDSLatency::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ 
!= CL_SUCCESS, "clReleaseKernel failed"); + } + if (kernel2_) { + error_ = _wrapper->clReleaseKernel(kernel2_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h new file mode 100644 index 0000000000..29eedfed79 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSLatency.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_LDSLATENCY_H_
+#define _OCL_LDSLATENCY_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfLDSLatency : public OCLTestImp {  // perf test: reports ns per read (see run())
+ public:
+  OCLPerfLDSLatency();
+  virtual ~OCLPerfLDSLatency();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // creates context, queue, buffers and both kernels
+  virtual void run(void);                    // times kernel_ minus kernel2_ and fills _perfInfo
+  virtual unsigned int close(void);          // releases all OpenCL objects, returns _crcword
+
+  std::string shader_;  // kernel source handed to clCreateProgramWithSource
+  void genShader(void);  // called by open() right before shader_ is compiled
+  void setData(cl_mem buffer, unsigned int data);
+  void checkData(cl_mem buffer);
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;   // the "MemWalker" kernel (timed launch)
+  cl_kernel kernel2_;  // the "Overhead" kernel (subtracted from the timing)
+  cl_mem inBuffer_;    // created with width_ bytes
+  cl_mem outBuffer_;   // created with room for a single cl_uint
+  cl_int error_;       // status of the most recent OpenCL call
+
+  unsigned int width_;      // byte size passed to clCreateBuffer for inBuffer_
+  unsigned int bufSizeDW_;  // kernel arguments 2 and 3; printed as "reads" by run()
+  unsigned int repeats_;    // kernel argument 6: max((maxSize_ >> 4) / bufSizeDW_, 1)
+  unsigned int maxSize_;    // only used to derive repeats_ in open()
+  bool isAMD_;              // adds "-D USE_FLOAT" to the build; 64- vs 32-wide group
+  bool moreThreads;         // run() launches one full group instead of one work-item
+};
+
+#endif  // _OCL_LDSLATENCY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp
new file mode 100644
index 0000000000..1bb9087b17
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.cpp
@@ -0,0 +1,395 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfLDSReadSpeed.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; + +void OCLPerfLDSReadSpeed::genShader(unsigned int idx) { + shader_.clear(); + if (idx == 0) { + shader_ += + "__kernel __attribute__((reqd_work_group_size(64,1,1))) void " + "_ldsReadSpeed(__global float *outBuf, float c)\n" + "{\n" + " uint gid = (int) get_global_id(0);\n" + " uint lid = (int) get_local_id(0);\n" + " __local float localLocal[2048];\n" + " float val1 = c;\n" + " float val2 = c;\n" + " float val3 = c;\n" + " float val4 = c;\n" + " uint hacklid = gid % 64;\n" + " for (int i = 0; i < (2048/64); i++) {\n" + " localLocal[hacklid + i*64] = lid;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " val1 += localLocal[lid+0];\n" + " val2 += localLocal[lid+64];\n" + " val3 += localLocal[lid+128];\n" + " val4 += localLocal[lid+192];\n" + " val1 += localLocal[lid+256];\n" + " val2 += localLocal[lid+320];\n" + " val3 += localLocal[lid+384];\n" + " val4 += localLocal[lid+448];\n" + " val1 += localLocal[lid+512];\n" + " val2 += localLocal[lid+576];\n" + " val3 += localLocal[lid+640];\n" + " val4 += localLocal[lid+704];\n" + " val1 += localLocal[lid+768];\n" + " val2 += localLocal[lid+832];\n" + " val3 += localLocal[lid+896];\n" + " val4 += localLocal[lid+960];\n" + " val1 += localLocal[lid+1024];\n" + " val2 += localLocal[lid+1088];\n" + " val3 += localLocal[lid+1152];\n" + " val4 += localLocal[lid+1216];\n" + " val1 += localLocal[lid+1280];\n" + " val2 += localLocal[lid+1344];\n" + " val3 += localLocal[lid+1408];\n" + " val4 += localLocal[lid+1472];\n" + " val1 += localLocal[lid+1536];\n" + " val2 += localLocal[lid+1600];\n" + " val3 += localLocal[lid+1664];\n" + " val4 += localLocal[lid+1728];\n" + " 
val1 += localLocal[lid+1792];\n" + " val2 += localLocal[lid+1856];\n" + " val3 += localLocal[lid+1920];\n" + " val4 += localLocal[lid+1984];\n" + " outBuf[gid] = val1+val2+val3+val4;\n" + "}\n"; + ldsSizeBytes_ = 2048 * 4; + } else if (idx == 1) { + shader_ += + "__kernel __attribute__((reqd_work_group_size(64,1,1))) void " + "_ldsReadSpeed(__global float *outBuf, float c)\n" + "{\n" + " uint gid = (uint) get_global_id(0);\n" + " int lid = (int) get_local_id(0);\n" + " __local float localLocal[768];\n" + " float val0 = 0.0f;\n" + " float val1 = 0.0f;\n" + " uint hacklid = gid % 64;\n" + " for (int i = 0; i < (768/64); i++) {\n" + " localLocal[hacklid + i*64] = lid;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "#pragma nounroll\n" + "for (uint i = 0; i < 32;i++)\n" + "{\n" + " val0 += localLocal[lid+0];\n" + " val1 += localLocal[lid+64];\n" + " val0 += localLocal[lid+128];\n" + " val1 += localLocal[lid+192];\n" + " val0 += localLocal[lid+256];\n" + " val1 += localLocal[lid+320];\n" + " val0 += localLocal[lid+384];\n" + " val1 += localLocal[lid+448];\n" + " lid += 1;\n" + "}\n" + "val0 += val1;\n" + "val1 = min(val0,1.0f);\n" + "if ((lid + val1) < 0){\n" + " outBuf[gid] = val0;\n" + "}\n" + "}\n"; + ldsSizeBytes_ = 768 * 4; + } else { + shader_ += + "__kernel __attribute__((reqd_work_group_size(64,1,1))) void " + "_ldsReadSpeed(__global float *outBuf, float c)\n" + "{\n" + " uint gid = (uint) get_global_id(0);\n" + " int lid = (int) get_local_id(0);\n" + " __local float localLocal[256];\n" + " float val0 = 0.0f;\n" + " float val1 = 0.0f;\n" + " uint hacklid = gid % 64;\n" + " for (int i = 0; i < (256/64); i++) {\n" + " localLocal[hacklid + i*64] = lid;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "#pragma nounroll\n" + "for (uint i = 0; i < 32;i++)\n" + "{\n" + " val0 += localLocal[8*i+0];\n" + " val1 += localLocal[8*i+1];\n" + " val0 += localLocal[8*i+2];\n" + " val1 += localLocal[8*i+3];\n" + " val0 += localLocal[8*i+4];\n" + " val1 += 
localLocal[8*i+5];\n" + " val0 += localLocal[8*i+6];\n" + " val1 += localLocal[8*i+7];\n" + "}\n" + "val0 += val1;\n" + "val1 = min(val0,1.0f);\n" + "if ((lid + val1) < 0){\n" + " outBuf[gid] = val0;\n" + "}\n" + "}\n"; + ldsSizeBytes_ = 256 * 4; + } +} + +OCLPerfLDSReadSpeed::OCLPerfLDSReadSpeed() { _numSubTests = NUM_SIZES * 3; } + +OCLPerfLDSReadSpeed::~OCLPerfLDSReadSpeed() {} + +void OCLPerfLDSReadSpeed::setData(cl_mem buffer, float val) { + float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, + CL_MAP_WRITE, 0, bufSize_, + 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); + _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfLDSReadSpeed::checkData(cl_mem buffer) { + float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, + CL_MAP_READ, 0, bufSize_, + 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) { + if (data[i] != (float)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, numReads_, + numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); + _wrapper->clFinish(cmd_queue_); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfLDSReadSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + + context_ = 0; + 
cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + outBuffer_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + // if (num_devices > 0) + //{ + // platform = platforms[_platformIndex]; + // break; + //} +#if 0 + } +#endif + delete platforms; + } + + numReads_ = 32; + width_ = Sizes[test % NUM_SIZES]; + shaderIdx_ = test / NUM_SIZES; + + bufSize_ = width_; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + genShader(shaderIdx_); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "_ldsReadSpeed", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + float foo = 0; + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_float), (void *)&foo); + + setData(outBuffer_, 1.2345678f); +} + +void OCLPerfLDSReadSpeed::run(void) { + int global = bufSize_ / sizeof(cl_float); + int local = 64; + + size_t 
global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + char buf[256]; + const char *buf2; + if (shaderIdx_ == 0) { + buf2 = " def kernel"; + } else if (shaderIdx_ == 1) { + buf2 = "SI friendly"; + numReads_ *= 8; + } else { + buf2 = " broadcast"; + numReads_ *= 8; + } + // LDS bandwidth in GB/s + // We have one extra write per LDS location to initialize LDS + double perf = + ((double)global * (numReads_ * sizeof(cl_float) + ldsSizeBytes_ / 64) * + NUM_ITER * (double)(1e-09)) / + sec; + + _perfInfo = (float)perf; + SNPRINTF(buf, sizeof(buf), " %s %8d threads, %3d reads (GB/s) ", buf2, global, + numReads_); + testDescString = buf; + // checkData(outBuffer_); +} + +unsigned int OCLPerfLDSReadSpeed::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; 
+} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h new file mode 100644 index 0000000000..3214cb471f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfLDSReadSpeed.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_LDSReadSpeed_H_
+#define _OCL_LDSReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+class OCLPerfLDSReadSpeed : public OCLTestImp {  // perf test: reports a GB/s figure (see run())
+ public:
+  OCLPerfLDSReadSpeed();
+  virtual ~OCLPerfLDSReadSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);  // builds the kernel for sub-test `test`
+  virtual void run(void);                    // NUM_ITER timed launches, fills _perfInfo
+  virtual unsigned int close(void);          // releases all OpenCL objects, returns _crcword
+
+  std::string shader_;  // kernel source produced by genShader()
+  void genShader(unsigned int idx);  // idx 0, 1, anything else: three kernel variants
+  void setData(cl_mem buffer, float data);  // fills bufSize_ bytes of buffer with data
+  void checkData(cl_mem buffer);  // compares buffer contents against numReads_
+
+  static const unsigned int NUM_ITER = 100;  // kernel launches per timed run()
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;  // the "_ldsReadSpeed" kernel
+  cl_mem outBuffer_;  // bufSize_ bytes, kernel argument 0
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int width_;         // Sizes[test % NUM_SIZES], in bytes
+  unsigned int bufSize_;       // set equal to width_ in open()
+  unsigned int vecSizeIdx_;    // NOTE(review): not referenced by the .cpp shown here
+  unsigned int numReads_;      // set to 32 in open(); run() reports 8x for variants 1 and 2
+  unsigned int shaderIdx_;     // test / NUM_SIZES, selects the genShader() variant
+  unsigned int ldsSizeBytes_;  // __local array size in bytes, set by genShader()
+};
+
+#endif  // _OCL_LDSReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp
new file mode 100644
index 0000000000..220ddf430c
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.cpp
@@ -0,0 +1,940 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfMandelbrot.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +typedef struct { + double x; + double y; + double width; +} coordRec; + +coordRec coords[] = { + {0.0, 0.0, 4.0}, // Whole set + {0.0, 0.0, 0.00001}, // All black + {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +static const char *float_mandel = + "__kernel void mandelbrot(__global uint *out, uint width, float xPos, " + "float yPos, float xStep, float yStep, uint maxIter)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i = tid % width;\n" + " int j = tid / width;\n" + " float x0 = (float)(xPos + xStep*i);\n" + " float y0 = (float)(yPos + yStep*j);\n" + "\n" + " float x = x0;\n" + " float y = y0;\n" + "\n" + " uint iter = 0;\n" + " float tmp;\n" + " for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++)\n" + " {\n" + " tmp = x;\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + " }\n" + " out[tid] = iter;\n" + "}\n"; + +static const char *float_mandel_vec = + "__kernel void mandelbrot(__global uint *out, uint width, float xPos, " + "float yPos, float xStep, float yStep, uint maxIter)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i = tid % (width/4);\n" + " int j = tid / (width/4);\n" + " int4 veci = (int4)(4*i, 4*i+1, 
4*i+2, 4*i+3);\n" + " int4 vecj = (int4)(j, j, j, j);\n" + " float4 x0;\n" + " x0.s0 = (float)(xPos + xStep*veci.s0);\n" + " x0.s1 = (float)(xPos + xStep*veci.s1);\n" + " x0.s2 = (float)(xPos + xStep*veci.s2);\n" + " x0.s3 = (float)(xPos + xStep*veci.s3);\n" + " float4 y0;\n" + " y0.s0 = (float)(yPos + yStep*vecj.s0);\n" + " y0.s1 = (float)(yPos + yStep*vecj.s1);\n" + " y0.s2 = (float)(yPos + yStep*vecj.s2);\n" + " y0.s3 = (float)(yPos + yStep*vecj.s3);\n" + "\n" + " float4 x = x0;\n" + " float4 y = y0;\n" + "\n" + " uint iter = 0;\n" + " float4 tmp;\n" + " int4 stay;\n" + " int4 ccount = 0;\n" + " float4 savx = x;\n" + " float4 savy = y;\n" + " stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n" + " for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < " + "maxIter); iter+=16)\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = 
MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n" + " savx = select(savx,x,stay);\n" + " savy = select(savy,y,stay);\n" + " ccount -= stay*16;\n" + " }\n" + " // Handle remainder\n" + " if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n" + " {\n" + " iter = 16;\n" + " do\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + " // More efficient to use scalar ops here: Why?\n" + " stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < " + "maxIter);\n" + " stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < " + "maxIter);\n" + " stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < " + "maxIter);\n" + " stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < " + "maxIter);\n" + " tmp = x;\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + " ccount += stay;\n" + " iter--;\n" + " savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n" + " savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n" + " savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n" + " savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n" + " savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n" + " savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n" + " savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n" + " savy.s3 = (stay.s3 ? 
y.s3 : savy.s3);\n" + " } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n" + " }\n" + " __global uint4 *vecOut = (__global uint4 *)out;\n" + " vecOut[tid] = convert_uint4(ccount);\n" + "}\n"; + +static const char *float_mandel_unroll = + "__kernel void mandelbrot(__global uint *out, uint width, float xPos, " + "float yPos, float xStep, float yStep, uint maxIter)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i = tid % width;\n" + " int j = tid / width;\n" + " float x0 = (float)(xPos + xStep*(float)i);\n" + " float y0 = (float)(yPos + yStep*(float)j);\n" + "\n" + " float x = x0;\n" + " float y = y0;\n" + "\n" + "#define FAST\n" + " uint iter = 0;\n" + " float tmp;\n" + " int stay;\n" + " int ccount = 0;\n" + " stay = (x*x+y*y) <= 4.0;\n" + " float savx = x;\n" + " float savy = y;\n" + "#ifdef FAST\n" + " for (iter = 0; (iter < maxIter); iter+=16)\n" + "#else\n" + " for (iter = 0; stay && (iter < maxIter); iter+=16)\n" + "#endif\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = 
MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " stay = (x*x+y*y) <= 4.0;\n" + " savx = select(savx,x,stay);\n" + " savy = select(savy,y,stay);\n" + " ccount += stay*16;\n" + "#ifdef FAST\n" + " if (!stay)\n" + " break;\n" + "#endif\n" + " }\n" + " // Handle remainder\n" + " if (!stay)\n" + " {\n" + " iter = 16;\n" + " do\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + " stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);\n" + " tmp = x;\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + " ccount += stay;\n" + " iter--;\n" + " savx = select(savx,x,stay);\n" + " savy = select(savy,y,stay);\n" + " } while (stay && iter);\n" + " }\n" + " out[tid] = (uint)ccount;\n" + "}\n"; + +static const char *double_mandel = + "#ifdef USE_CL_AMD_FP64\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n" + "#ifdef USE_CL_KHR_FP64\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n" + "__kernel void mandelbrot(__global uint *out, uint width, double xPos, " + "double yPos, double xStep, double yStep, uint maxIter)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i = tid % width;\n" + " int j = tid / width;\n" + " double x0 = (double)(xPos + xStep*i);\n" + " double y0 = (double)(yPos + yStep*j);\n" + "\n" + " double x = x0;\n" + " double 
y = y0;\n" + "\n" + " uint iter = 0;\n" + " double tmp;\n" + " for (iter = 0; (x*x + y*y <= 4.0) && (iter < maxIter); iter++)\n" + " {\n" + " tmp = x;\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + " }\n" + " out[tid] = iter;\n" + "}\n"; + +static const char *double_mandel_unroll = + "#ifdef USE_CL_AMD_FP64\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n" + "#ifdef USE_CL_KHR_FP64\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n" + "__kernel void mandelbrot(__global uint *out, uint width, double xPos, " + "double yPos, double xStep, double yStep, uint maxIter)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i = tid % width;\n" + " int j = tid / width;\n" + " double x0 = (double)(xPos + xStep*(double)i);\n" + " double y0 = (double)(yPos + yStep*(double)j);\n" + "\n" + " double x = x0;\n" + " double y = y0;\n" + "\n" + "#define FAST\n" + " uint iter = 0;\n" + " double tmp;\n" + " int stay;\n" + " int ccount = 0;\n" + " stay = (x*x+y*y) <= 4.0;\n" + " double savx = x;\n" + " double savy = y;\n" + "#ifdef FAST\n" + " for (iter = 0; (iter < maxIter); iter+=16)\n" + "#else\n" + " for (iter = 0; stay && (iter < maxIter); iter+=16)\n" + "#endif\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = 
MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " // Two iterations\n" + " tmp = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*x,y,y0);\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(tmp,tmp,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + "\n" + " stay = (x*x+y*y) <= 4.0;\n" + " savx = (stay ? x : savx);//select(savx,x,stay);\n" + " savy = (stay ? y : savy);//select(savy,y,stay);\n" + " ccount += stay*16;\n" + "#ifdef FAST\n" + " if (!stay)\n" + " break;\n" + "#endif\n" + " }\n" + " // Handle remainder\n" + " if (!stay)\n" + " {\n" + " iter = 16;\n" + " do\n" + " {\n" + " x = savx;\n" + " y = savy;\n" + " stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);\n" + " tmp = x;\n" + " x = MUL_ADD_INS(-y,y,MUL_ADD_INS(x,x,x0));\n" + " y = MUL_ADD_INS(2.0f*tmp,y,y0);\n" + " ccount += stay;\n" + " iter--;\n" + " savx = (stay ? x : savx);//select(savx,x,stay);\n" + " savy = (stay ? 
y : savy);//select(savy,y,stay);\n" + " } while (stay && iter);\n" + " }\n" + " out[tid] = (uint)ccount;\n" + "}\n"; + +static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; + +// Expected results for each kernel run at each coord +unsigned long long expectedIters[] = { + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, + 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, + 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, + 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, + 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; + +// nvidia supports CL_KHR_FP64, so they get better results for doubles. Not +// sure why we differ in floats though +unsigned long long expectedItersNV[] = { + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, + 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, + 120254651ull, 203315226ull, 2147483648ull, 120091921ull, + 203315226ull, 2147483648ull, 120091921ull, // end of mad + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, + 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, + 120485704ull, 203315114ull, 2147483648ull, 120042599ull, + 203315114ull, 2147483648ull, 120042599ull}; + +const char *shaderStr[] = {" float_mad", " float_vector_mad", + " float_unroll_mad", " double_mad", + "double_unroll_mad", " float_fma", + " float_vector_fma", " float_unroll_fma", + " double_fma", "double_unroll_fma"}; + +OCLPerfMandelbrot::OCLPerfMandelbrot() { _numSubTests = 10 * numCoords; } + +OCLPerfMandelbrot::~OCLPerfMandelbrot() {} + +void OCLPerfMandelbrot::setData(cl_mem buffer, unsigned int val) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < width_ * width_; i++) data[i] = val; + error_ = 
_wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +void OCLPerfMandelbrot::checkData(cl_mem buffer) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < width_ * width_; i++) { + totalIters += data[i]; + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfMandelbrot::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + skip = false; + totalIters = 0; + isAMD = false; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + outBuffer_ = 0; + + // Maximum iteration count + // NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible + // by 16 NOTE: Can increase to get better peak performance numbers, but be + // sure not to TDR slow ASICs! + unsigned int maxIter = 32768; + + // NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel + // processes 4 pixels at once NOTE: Can increase to get better peak + // performance numbers, but be sure not to TDR slow ASICs! 
+ width_ = 256; + + // We compute a square domain + bufSize_ = width_ * width_ * sizeof(cl_uint); + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default +#if 0 + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); +#if 0 + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + platform = platforms[i]; + break; + } +#endif + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + platform = platforms[_platformIndex]; + } +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + doubleSupport = false; + + char *p = strstr(charbuf, "cl_amd_fp64"); + char *p2 = strstr(charbuf, "cl_khr_fp64"); + + if (p || p2) + doubleSupport = true; + else + doubleSupport = false; + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + const char *tmp; + shaderIdx = _openTest / numCoords; + if ((doubleSupport != true) && ((shaderIdx == 3) || (shaderIdx == 4) || + (shaderIdx == 8) || (shaderIdx == 9))) { + // We don't support doubles, so skip those tests + skip = true; + _perfInfo = 0.0f; + return; + } + + if (shaderIdx == 0 || shaderIdx == 5) { + tmp = float_mandel; + } else if (shaderIdx == 1 || shaderIdx == 6) { + tmp = float_mandel_vec; + } else if (shaderIdx == 2 || shaderIdx == 7) { + tmp = float_mandel_unroll; + } else if (shaderIdx == 3 || shaderIdx == 8) { + tmp = double_mandel; + } else { + tmp = double_mandel_unroll; + } + std::string curr(tmp); + std::string 
searchString("MUL_ADD_INS"); + std::string replaceString; + if (shaderIdx < 5) { + replaceString = "mad"; + } else { + replaceString = "fma"; + } + + std::string::size_type pos = 0; + while ((pos = curr.find(searchString, pos)) != std::string::npos) { + curr.replace(pos, searchString.size(), replaceString); + pos++; + } + + tmp = curr.c_str(); + + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + const char *buildOps = NULL; + if (p) + buildOps = "-DUSE_CL_AMD_FP64"; + else if (p2) + buildOps = "-DUSE_CL_KHR_FP64"; + error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "mandelbrot", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + coordIdx = _openTest % numCoords; + if ((shaderIdx == 0) || (shaderIdx == 1) || (shaderIdx == 2) || + (shaderIdx == 5) || (shaderIdx == 6) || (shaderIdx == 7)) { + float xStep = (float)(coords[coordIdx].width / (double)width_); + float yStep = (float)(-coords[coordIdx].width / (double)width_); + float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&outBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_uint), (void *)&width_); + error_ = + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_float), (void *)&xPos); + error_ = + _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_float), (void *)&yPos); + error_ = + _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_float), (void 
*)&xStep); + error_ = + _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_float), (void *)&yStep); + error_ = + _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&maxIter); + } else { + double xStep = coords[coordIdx].width / (double)width_; + double yStep = -coords[coordIdx].width / (double)width_; + double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; + double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&outBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_uint), (void *)&width_); + error_ = + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_double), (void *)&xPos); + error_ = + _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_double), (void *)&yPos); + error_ = + _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_double), (void *)&xStep); + error_ = + _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_double), (void *)&yStep); + error_ = + _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&maxIter); + } + setData(outBuffer_, 0xdeadbeef); +} + +void OCLPerfMandelbrot::run(void) { + if (skip) return; + int global = width_ * width_; + // We handle 4 pixels per thread + if ((shaderIdx == 1) || (shaderIdx == 6)) global >>= 2; + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + // Warm-up + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + double totalTime = 0.0; + + for (unsigned int k = 0; k < numLoops; k++) { + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, 
"clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + totalTime += sec; + } + + checkData(outBuffer_); + // Compute GFLOPS. There are 7 FLOPs per iteration + double perf = ((double)totalIters * 7 * (double)(1e-09)) / + (totalTime / (double)numLoops); + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " %s (GFLOPS) ", shaderStr[shaderIdx]); + testDescString = buf; + // Dump iteration count + // printf(" totalIter = %lld\n", totalIters); + if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) { + CHECK_RESULT((totalIters != expectedIters[_openTest]) && + (totalIters != + expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX + ? _openTest + FMA_EXPECTEDVALUES_INDEX + : _openTest)]), + "Incorrect iteration count detected!"); + } else { + CHECK_RESULT(totalIters != expectedItersNV[_openTest], + "Incorrect iteration count detected!"); + } +} + +unsigned int OCLPerfMandelbrot::close(void) { + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} + +OCLPerfAsyncMandelbrot::OCLPerfAsyncMandelbrot() {} + +OCLPerfAsyncMandelbrot::~OCLPerfAsyncMandelbrot() {} + +void OCLPerfAsyncMandelbrot::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + 
// Create common items first + OCLPerfMandelbrot::open(test, units, conversion, deviceId); + + // Create resources for async test + cmd_queue2_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue2_ == 0, "clCreateCommandQueue failed"); + + outBuffer2_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer2_ == 0, "clCreateBuffer(outBuffer2) failed"); // validate the buffer that was just created, not outBuffer_ +} + +void OCLPerfAsyncMandelbrot::run(void) { + if (skip) return; + int global = width_ * width_; + // We handle 4 pixels per thread + if ((shaderIdx == 1) || (shaderIdx == 6)) global >>= 2; + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + // Warm-up + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&outBuffer2_); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue2_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue2_); + + double totalTime = 0.0; + + for (unsigned int k = 0; k < numLoops; k++) { + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&outBuffer_); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&outBuffer2_); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue2_, kernel_, 1, NULL, (const size_t *)global_work_size, + 
(const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFlush(cmd_queue_); + _wrapper->clFlush(cmd_queue2_); + _wrapper->clFinish(cmd_queue_); + _wrapper->clFinish(cmd_queue2_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + totalTime += sec; + } + + checkData(outBuffer_); + checkData(outBuffer2_); + // Compute GFLOPS. There are 7 FLOPs per iteration + double perf = ((double)(totalIters * 7) * (double)(1e-09)) / + (totalTime / (double)numLoops); + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " async %s (GFLOPS) ", shaderStr[shaderIdx]); + testDescString = buf; + // Dump iteration count + // printf(" totalIter = %lld\n", totalIters); + if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) { + CHECK_RESULT( + (totalIters != 2 * expectedIters[_openTest]) && + (totalIters != + 2 * expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX + ? _openTest + FMA_EXPECTEDVALUES_INDEX + : _openTest)]), + "Incorrect iteration count detected!"); + } else { + CHECK_RESULT(totalIters != 2 * expectedItersNV[_openTest], + "Incorrect iteration count detected!"); + } +} + +unsigned int OCLPerfAsyncMandelbrot::close(void) { + _wrapper->clFinish(cmd_queue_); + _wrapper->clFinish(cmd_queue2_); + + // Clean up async test items + if (outBuffer2_) { + error_ = _wrapper->clReleaseMemObject(outBuffer2_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer2_) failed"); + } + if (cmd_queue2_) { // guard on the queue that is actually released below + error_ = _wrapper->clReleaseCommandQueue(cmd_queue2_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + // Clean up the rest + return OCLPerfMandelbrot::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h new file mode 100644 index 0000000000..f810801038 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMandelbrot.h @@ -0,0 
+1,75 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_Mandelbrot_H_ +#define _OCL_Mandelbrot_H_ + +#include "OCLTestImp.h" + +class OCLPerfMandelbrot : public OCLTestImp { + public: + OCLPerfMandelbrot(); + virtual ~OCLPerfMandelbrot(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void setData(cl_mem buffer, unsigned int data); + void checkData(cl_mem buffer); + + cl_context context_; + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_mem outBuffer_; + cl_int error_; + cl_device_id device; + + unsigned int width_; + unsigned int bufSize_; + bool doubleSupport; + bool skip; + unsigned int maxIter; + unsigned int shaderIdx; + unsigned int coordIdx; + unsigned long long totalIters; + bool isAMD; + static const unsigned int numLoops = 10; +}; + +class OCLPerfAsyncMandelbrot : public OCLPerfMandelbrot { + public: + OCLPerfAsyncMandelbrot(); + virtual ~OCLPerfAsyncMandelbrot(); + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + cl_command_queue cmd_queue2_; + cl_mem outBuffer2_; +}; + +#endif // _OCL_Mandelbrot_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp new file mode 100644 index 0000000000..74618f5b46 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.cpp @@ -0,0 +1,262 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfMapBufferReadSpeed.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 4 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; + +static const unsigned int Iterations[2] = {1, + OCLPerfMapBufferReadSpeed::NUM_ITER}; +#define NUM_OFFSETS 1 +static const unsigned int offsets[NUM_OFFSETS] = {0}; +#define NUM_SUBTESTS (3 + NUM_OFFSETS) +OCLPerfMapBufferReadSpeed::OCLPerfMapBufferReadSpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2; +} + +OCLPerfMapBufferReadSpeed::~OCLPerfMapBufferReadSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfMapBufferReadSpeed::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + persistent = false; + allocHostPtr = false; + useHostPtr = false; + hostMem = NULL; + alignedMem = NULL; + alignment = 4096; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = 
_wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + bufSize_ = Sizes[_openTest % NUM_SIZES]; + if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) { + useHostPtr = true; + offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3]; + } else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) { + persistent = true; + } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) { + allocHostPtr = true; + } + + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)]; + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue 
failed"); + + cl_mem_flags flags = CL_MEM_WRITE_ONLY; + if (persistent) { + flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; + } else if (allocHostPtr) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset); + CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed"); + alignedMem = + (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) + + offset); + } + outBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + // Force memory to be on GPU, if possible + { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } +} + +void OCLPerfMapBufferReadSpeed::run(void) { + CPerfCounter timer; + + void *mem; + // Warm up + mem = + _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ, + 0, bufSize_, 0, NULL, NULL, &error_); + + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, + CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, + &error_); + + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed"); + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, 
"clFinish failed"); + } + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Map read bandwidth in GB/s + double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + if (persistent || allocHostPtr) { + _perfInfo = (float)(sec / numIter) * 1000000.0f; // Get us per map + } else { + _perfInfo = (float)perf; + } + char str[256]; + if (persistent) { + SNPRINTF(str, sizeof(str), "PERSISTENT (us)"); + } else if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (us)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } else { + SNPRINTF(str, sizeof(str), "(GB/s)"); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) i: %4d %29s ", bufSize_, numIter, + str); + testDescString = buf; +} + +unsigned int OCLPerfMapBufferReadSpeed::close(void) { + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (hostMem) { + free(hostMem); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h new file mode 100644 index 0000000000..4017061d79 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferReadSpeed.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_MapBufferReadSpeed_H_ +#define _OCL_MapBufferReadSpeed_H_ + +#include "OCLTestImp.h" + +class OCLPerfMapBufferReadSpeed : public OCLTestImp { + public: + OCLPerfMapBufferReadSpeed(); + virtual ~OCLPerfMapBufferReadSpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 1000; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int bufSize_; + bool persistent; + bool allocHostPtr; + bool useHostPtr; + unsigned int numIter; + char* hostMem; + char* alignedMem; + size_t alignment; + unsigned int offset; + bool isAMD; +}; + +#endif // _OCL_MapBufferReadSpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp new file mode 100644 index 0000000000..dd12ded6d4 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.cpp @@ -0,0 +1,291 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE. */
+
+#include "OCLPerfMapBufferWriteSpeed.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// 256KB, 1 MB, 4MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
+                                              16777216};
+
+// Timed-loop iteration counts; open() picks one per sub-test group.
+static const unsigned int Iterations[2] = {
+    1, OCLPerfMapBufferWriteSpeed::NUM_ITER};
+#define NUM_OFFSETS 1
+static const unsigned int offsets[NUM_OFFSETS] = {0};
+// Buffer flavours per size: default, ALLOC_HOST_PTR, persistent (AMD only),
+// then one USE_HOST_PTR flavour per entry of offsets[].
+#define NUM_SUBTESTS (3 + NUM_OFFSETS)
+
+// Sub-tests come in three groups of NUM_SIZES * NUM_SUBTESTS entries:
+// CL_MAP_WRITE with 1 iteration, CL_MAP_WRITE with NUM_ITER iterations and
+// CL_MAP_WRITE_INVALIDATE_REGION with NUM_ITER iterations.
+OCLPerfMapBufferWriteSpeed::OCLPerfMapBufferWriteSpeed() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * 3;
+}
+
+OCLPerfMapBufferWriteSpeed::~OCLPerfMapBufferWriteSpeed() {}
+
+// Context error callback; intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Decodes the sub-test index into buffer size, allocation flavour, iteration
+// count and map flags, then creates the context, queue and buffer used by
+// run().
+//
+// test       sub-test index, see the constructor for the layout
+// units      not used by this test
+// conversion set to 1.0
+// deviceId   index of the device to use among the devices of type_ on the
+//            platform selected by _platformIndex
+//
+// Failures are reported through CHECK_RESULT.
+void OCLPerfMapBufferWriteSpeed::open(unsigned int test, char *units,
+                                      double &conversion,
+                                      unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only the requested entry is needed; an out-of-range index leaves
+    // platform NULL and is reported by the platform check further down.
+    if ((error_ == CL_SUCCESS) &&
+        (static_cast<cl_uint>(_platformIndex) < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    // Release the array before any early exit so it cannot leak.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  if (platform != NULL) {
+    // Start from an empty string so the vendor comparison below is well
+    // defined even when the query fails.
+    char pbuf[100] = "";
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so the status is deliberately not checked here.
+    if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+  }
+  /*
+   * If we could find our platform, use it.  If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Keep the three characters that follow the 7-character prefix of the
+  // version string; run() reads positions 0 and 2 of this copy.
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  const unsigned int flavour = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (flavour > 2) {
+    useHostPtr = true;
+    offset = offsets[flavour - 3];
+  } else if ((flavour == 2) && isAMD) {
+    persistent = true;
+  } else if (flavour == 1) {
+    allocHostPtr = true;
+  }
+
+  // The third group reuses the NUM_ITER entry, hence the clamp to index 1.
+  numIter = Iterations[std::min(_openTest / (NUM_SIZES * NUM_SUBTESTS), 1u)];
+
+  if (_openTest < NUM_SIZES * NUM_SUBTESTS * 2) {
+    mapFlags = CL_MAP_WRITE;
+  } else {
+    mapFlags = CL_MAP_WRITE_INVALIDATE_REGION;
+  }
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free the list before the
+  // checks so that neither the error paths nor the success path leak it.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  if (persistent) {
+    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
+  } else if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to `alignment` and then
+    // shifted by `offset` without leaving the allocation.
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem = (char *)((((uintptr_t)hostMem + alignment - 1) &
+                           ~(uintptr_t)(alignment - 1)) +
+                          offset);
+  }
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, outBuffer_, memBuffer, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Times numIter blocking map/unmap/finish round trips on outBuffer_ and
+// publishes the result in _perfInfo: microseconds per map for the persistent
+// and ALLOC_HOST_PTR flavours, GB/s otherwise.  testDescString receives the
+// matching description.
+void OCLPerfMapBufferWriteSpeed::run(void) {
+  CPerfCounter timer;
+
+  if (_openTest >= NUM_SIZES * NUM_SUBTESTS * 2) {
+    // Skip CL_MAP_WRITE_INVALIDATE_REGION testing for 1.0 and 1.1 platforms
+    if ((platformVersion[0] == '1') &&
+        ((platformVersion[2] == '0') || (platformVersion[2] == '1'))) {
+      char buf[256];
+      SNPRINTF(buf, sizeof(buf), " SKIPPED ");
+      testDescString = buf;
+      return;
+    }
+  }
+  void *mem;
+  // Warm up
+  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    mem =
+        _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
+                                     0, bufSize_, 0, NULL, NULL, &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Map write bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  if (persistent || allocHostPtr) {
+    _perfInfo = (float)(sec / numIter) * 1000000.0f;  // Get us per map
+  } else {
+    _perfInfo = (float)perf;
+  }
+  // The arguments below are unsigned, so the conversions use %u.
+  char str[256];
+  if (persistent) {
+    SNPRINTF(str, sizeof(str), "PERSISTENT (us)");
+  } else if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (us)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  } else {
+    SNPRINTF(str, sizeof(str), "(GB/s)");
+  }
+  char str2[256];
+  if (mapFlags == CL_MAP_WRITE_INVALIDATE_REGION) {
+    SNPRINTF(str2, sizeof(str2), "INV_REG %29s", str);
+  } else {
+    SNPRINTF(str2, sizeof(str2), "%29s", str);
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) i: %4u %37s ", bufSize_, numIter,
+           str2);
+  testDescString = buf;
+}
+
+// Releases everything open() created and returns _crcword.  Handles are
+// cleared after release so a repeated close() does not release them twice.
+unsigned int OCLPerfMapBufferWriteSpeed::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+    outBuffer_ = 0;
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    cmd_queue_ = 0;
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    context_ = 0;
+  }
+  // free(NULL) is a no-op, so no guard is needed.
+  free(hostMem);
+  hostMem = NULL;
+  alignedMem = NULL;
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h
new file mode 100644
index 0000000000..ebcf8dc7d4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapBufferWriteSpeed.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices,
Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_MapBufferWriteSpeed_H_
+#define _OCL_MapBufferWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times blocking map/unmap round trips of a buffer
+// mapped for writing.  open() decodes the sub-test index into the settings
+// below, run() performs the timed loop and close() releases the resources.
+class OCLPerfMapBufferWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfMapBufferWriteSpeed();
+  virtual ~OCLPerfMapBufferWriteSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Iteration count of the long timed loop (second entry of the Iterations
+  // table in the .cpp).
+  static const unsigned int NUM_ITER = 1000;
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;
+  // Most recent OpenCL status code.
+  cl_int error_;
+
+  // Buffer size in bytes for the current sub-test.
+  unsigned int bufSize_;
+  // Allocation flavour; at most one is set by open().  They add
+  // CL_MEM_USE_PERSISTENT_MEM_AMD, CL_MEM_ALLOC_HOST_PTR and
+  // CL_MEM_USE_HOST_PTR respectively to the buffer flags.
+  bool persistent;
+  bool allocHostPtr;
+  bool useHostPtr;
+  // Number of map/unmap iterations timed by run().
+  unsigned int numIter;
+  // Raw host allocation backing the USE_HOST_PTR flavour; freed in close().
+  char* hostMem;
+  // hostMem rounded up to `alignment` plus `offset`; handed to clCreateBuffer.
+  char* alignedMem;
+  // Alignment in bytes applied to hostMem (open() sets 4096).
+  size_t alignment;
+  // Byte offset added to the aligned pointer for the USE_HOST_PTR flavour.
+  unsigned int offset;
+  // True when the selected platform reports the AMD vendor string and has at
+  // least one device of the requested type.
+  bool isAMD;
+  // CL_MAP_WRITE or CL_MAP_WRITE_INVALIDATE_REGION, chosen by open().
+  cl_map_flags mapFlags;
+  // Characters 7..9 of the CL_PLATFORM_VERSION string, NUL-terminated; run()
+  // uses them to skip the INVALIDATE_REGION sub-tests on 1.0/1.1 platforms.
+  char platformVersion[32];
+};
+
+#endif  // _OCL_MapBufferWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp
new file mode 100644
index 0000000000..16572c5156
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.cpp
@@ -0,0 +1,213 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE. */
+
+#include "OCLPerfMapImageReadSpeed.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Edge length in pixels of the square test images.
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+// Bytes per pixel of the matching formats[] entry.
+static const unsigned int formatSize[NUM_FORMATS] = {4};
+
+// Timed-loop iteration counts; open() picks one per sub-test group.
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfMapImageReadSpeed::NUM_ITER};
+
+// Two groups of NUM_SIZES * NUM_FORMATS sub-tests: 1 iteration, then NUM_ITER.
+OCLPerfMapImageReadSpeed::OCLPerfMapImageReadSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS * 2;
+}
+
+OCLPerfMapImageReadSpeed::~OCLPerfMapImageReadSpeed() {}
+
+// Context error callback; intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Decodes the sub-test index into image size, format and iteration count,
+// then creates the context, queue and 2D image used by run().
+//
+// test       sub-test index, see the constructor for the layout
+// units      not used by this test
+// conversion set to 1.0
+// deviceId   index of the device to use among the devices of type_ on the
+//            platform selected by _platformIndex
+//
+// Failures are reported through CHECK_RESULT.
+void OCLPerfMapImageReadSpeed::open(unsigned int test, char *units,
+                                    double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only the requested entry is needed; an out-of-range index leaves
+    // platform NULL and is reported by the platform check further down.
+    if ((error_ == CL_SUCCESS) &&
+        (static_cast<cl_uint>(_platformIndex) < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    // Release the array before any early exit so it cannot leak.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  if (platform != NULL) {
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so the status is deliberately not checked here.
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  // Clamp the group index so an out-of-range sub-test cannot read past the
+  // two-entry Iterations table.
+  const unsigned int iterGroup = _openTest / (NUM_SIZES * NUM_FORMATS);
+  numIter = Iterations[iterGroup < 1 ? iterGroup : 1];
+
+  /*
+   * If we could find our platform, use it.  If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free the list before the
+  // checks so that neither the error paths nor the success path leak it.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+}
+
+// Times numIter blocking CL_MAP_READ map/unmap/finish round trips over the
+// whole image and stores the resulting bandwidth (GB/s) in _perfInfo.
+void OCLPerfMapImageReadSpeed::run(void) {
+  CPerfCounter timer;
+  void *mem;
+
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  // Warm up
+  mem = _wrapper->clEnqueueMapImage(
+      cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+      &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+
+  CHECK_RESULT(error_, "clEnqueueMapImage failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Image map read bandwidth in GB/s
+  double perf = ((double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter *
+                 (double)(1e-09)) /
+                sec;
+
+  _perfInfo = (float)perf;
+
+  // The numeric arguments are unsigned, so the conversions use %u.
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4ux%4u) fmt:%s i: %4u (GB/s) ", bufSize_,
+           bufSize_, textFormats[bufnum_], numIter);
+  testDescString = buf;
+}
+
+// Releases everything open() created and returns _crcword.  Handles are
+// cleared after release so a repeated close() does not release them twice.
+unsigned int OCLPerfMapImageReadSpeed::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+    outBuffer_ = 0;
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    cmd_queue_ = 0;
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    context_ = 0;
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h
new file mode 100644
index 0000000000..509075fc41
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageReadSpeed.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_MapImageReadSpeed_H_
+#define _OCL_MapImageReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times blocking CL_MAP_READ map/unmap round trips of a
+// square 2D image.  open() decodes the sub-test index and creates the image,
+// run() performs the timed loop and close() releases the resources.
+class OCLPerfMapImageReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfMapImageReadSpeed();
+  virtual ~OCLPerfMapImageReadSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Iteration count of the long timed loop (second entry of the Iterations
+  // table in the .cpp).
+  static const unsigned int NUM_ITER = 100;
+
+  // OpenCL objects created in open() and released in close(); outBuffer_ is
+  // the 2D image being mapped.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;
+  // Most recent OpenCL status code.
+  cl_int error_;
+
+  // Image width and height in pixels (the image is square).
+  unsigned int bufSize_;
+  // Index into the image format tables of the .cpp.
+  unsigned int bufnum_;
+  // Number of map/unmap iterations timed by run().
+  unsigned int numIter;
+};
+
+#endif  // _OCL_MapImageReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp
new file mode 100644
index 0000000000..9c0a4bd87e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.cpp
@@ -0,0 +1,214 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE. */
+
+#include "OCLPerfMapImageWriteSpeed.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 4
+// Edge length in pixels of the square test images.
+static const unsigned int Sizes[NUM_SIZES] = {256, 512, 1024, 2048};
+
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {
+    {CL_RGBA, CL_UNSIGNED_INT8}};
+static const char *textFormats[NUM_FORMATS] = {"R8G8B8A8"};
+// Bytes per pixel of the matching formats[] entry.
+static const unsigned int formatSize[NUM_FORMATS] = {4};
+
+// Timed-loop iteration counts; open() picks one per sub-test group.
+static const unsigned int Iterations[2] = {1,
+                                           OCLPerfMapImageWriteSpeed::NUM_ITER};
+
+// Two groups of NUM_SIZES * NUM_FORMATS sub-tests: 1 iteration, then NUM_ITER.
+OCLPerfMapImageWriteSpeed::OCLPerfMapImageWriteSpeed() {
+  _numSubTests = NUM_SIZES * NUM_FORMATS * 2;
+}
+
+OCLPerfMapImageWriteSpeed::~OCLPerfMapImageWriteSpeed() {}
+
+// Context error callback; intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Decodes the sub-test index into image size, format and iteration count,
+// then creates the context, queue and 2D image used by run().
+//
+// test       sub-test index, see the constructor for the layout
+// units      not used by this test
+// conversion set to 1.0
+// deviceId   index of the device to use among the devices of type_ on the
+//            platform selected by _platformIndex
+//
+// Failures are reported through CHECK_RESULT.
+void OCLPerfMapImageWriteSpeed::open(unsigned int test, char *units,
+                                     double &conversion,
+                                     unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  outBuffer_ = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    // Only the requested entry is needed; an out-of-range index leaves
+    // platform NULL and is reported by the platform check further down.
+    if ((error_ == CL_SUCCESS) &&
+        (static_cast<cl_uint>(_platformIndex) < numPlatforms)) {
+      platform = platforms[_platformIndex];
+    }
+    // Release the array before any early exit so it cannot leak.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  if (platform != NULL) {
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices, so the status is deliberately not checked here.
+  }
+
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+  bufnum_ = (_openTest / NUM_SIZES) % NUM_FORMATS;
+  // Clamp the group index so an out-of-range sub-test cannot read past the
+  // two-entry Iterations table.
+  const unsigned int iterGroup = _openTest / (NUM_SIZES * NUM_FORMATS);
+  numIter = Iterations[iterGroup < 1 ? iterGroup : 1];
+
+  /*
+   * If we could find our platform, use it.  If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free the list before the
+  // checks so that neither the error paths nor the success path leak it.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_READ_ONLY;
+  outBuffer_ = _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_],
+                                         bufSize_, bufSize_, 0, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateImage(outBuffer) failed");
+}
+
+// Times numIter blocking CL_MAP_WRITE map/unmap/finish round trips over the
+// whole image and stores the resulting bandwidth (GB/s) in _perfInfo.
+void OCLPerfMapImageWriteSpeed::run(void) {
+  CPerfCounter timer;
+
+  void *mem;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSize_, bufSize_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  // Warm up
+  mem = _wrapper->clEnqueueMapImage(
+      cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+      &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+
+  CHECK_RESULT(error_, "clEnqueueMapImage failed");
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                               NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
+    error_ = _wrapper->clFinish(cmd_queue_);
+    CHECK_RESULT(error_, "clFinish failed");
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Image map write bandwidth in GB/s
+  double perf = ((double)bufSize_ * bufSize_ * formatSize[bufnum_] * numIter *
+                 (double)(1e-09)) /
+                sec;
+
+  _perfInfo = (float)perf;
+
+  // The numeric arguments are unsigned, so the conversions use %u.
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4ux%4u) fmt:%s i: %4u (GB/s) ", bufSize_,
+           bufSize_, textFormats[bufnum_], numIter);
+  testDescString = buf;
+}
+
+// Releases everything open() created and returns _crcword.  Handles are
+// cleared after release so a repeated close() does not release them twice.
+unsigned int OCLPerfMapImageWriteSpeed::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+    outBuffer_ = 0;
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    cmd_queue_ = 0;
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    context_ = 0;
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h
new file mode 100644
index 0000000000..0e05b4a3a2
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMapImageWriteSpeed.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_MapImageWriteSpeed_H_
+#define _OCL_MapImageWriteSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times blocking CL_MAP_WRITE map/unmap round trips of
+// a square 2D image.  open() decodes the sub-test index and creates the
+// image, run() performs the timed loop and close() releases the resources.
+class OCLPerfMapImageWriteSpeed : public OCLTestImp {
+ public:
+  OCLPerfMapImageWriteSpeed();
+  virtual ~OCLPerfMapImageWriteSpeed();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Iteration count of the long timed loop (second entry of the Iterations
+  // table in the .cpp).
+  static const unsigned int NUM_ITER = 100;
+
+  // OpenCL objects created in open() and released in close(); outBuffer_ is
+  // the 2D image being mapped.
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem outBuffer_;
+  // Most recent OpenCL status code.
+  cl_int error_;
+
+  // Image width and height in pixels (the image is square).
+  unsigned int bufSize_;
+  // Index into the image format tables of the .cpp.
+  unsigned int bufnum_;
+  // Number of map/unmap iterations timed by run().
+  unsigned int numIter;
+};
+
+#endif  // _OCL_MapImageWriteSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp
new file mode 100644
index 0000000000..7c8bae1d13
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.cpp
@@ -0,0 +1,326 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE. */
+
+#include "OCLPerfMatrixTranspose.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Work-group edge lengths and square matrix dimensions covered by the
+// sub-tests; open() decodes the sub-test index into one of each.
+static const unsigned int NUM_BLOCK_SIZES = 2;
+static const unsigned int blockSizes[NUM_BLOCK_SIZES] = {8, 16};
+static const unsigned int NUM_MATRIX_DIMS = 2;
+static const unsigned int matrixDims[NUM_MATRIX_DIMS] = {1024, 1920};
+// OpenCL C source of the transpose kernel, which stages each block in local
+// memory.
+static const char *matrixtranspose_kernel =
+    "kernel void matrixTranspose(global uint *restrict inBuf, global uint "
+    "*restrict outBuf, local uint *localBuf, uint blockSize, uint width, uint "
+    "height)\n"
+    "{\n"
+    " uint globalIdx = get_global_id(0);\n"
+    " uint globalIdy = get_global_id(1);\n"
+
+    " uint localIdx = get_local_id(0);\n"
+    " uint localIdy = get_local_id(1);\n"
+
+    " /* copy from input to local memory */\n"
+    " /* Note that we transpose the x and y coordinates when storing */\n"
+    " localBuf[localIdx*blockSize + localIdy] = inBuf[globalIdy*width + "
+    "globalIdx];\n"
+
+    " /* wait until the whole block is filled */\n"
+    " barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+    " uint groupIdx = get_group_id(0);\n"
+    " uint groupIdy = get_group_id(1);\n"
+
+    " /* calculate the corresponding target location for transpose by "
+    "inverting x and y values*/\n"
+    " /* Here we don't swap localIdx and localIdy, this is to get larger "
+    "bursts when threads write to memory. */\n"
+    " /* To make this work, we've swapped the coordinates when we write to "
+    "local memory. */\n"
+    " uint targetGlobalIdx = groupIdy*blockSize + localIdx;\n"
+    " uint targetGlobalIdy = groupIdx*blockSize + localIdy;\n"
+
+    " /* calculate the corresponding raster indices of source and target "
+    "*/\n"
+    " uint targetIndex = targetGlobalIdy*height + targetGlobalIdx;\n"
+    " uint sourceIndex = localIdy * blockSize + localIdx;\n"
+
+    " outBuf[targetIndex] = localBuf[sourceIndex];\n"
+    "}\n";
+
+// One sub-test per (block size, matrix dimension) pair.
+OCLPerfMatrixTranspose::OCLPerfMatrixTranspose() {
+  _numSubTests = NUM_BLOCK_SIZES * NUM_MATRIX_DIMS;
+}
+
+OCLPerfMatrixTranspose::~OCLPerfMatrixTranspose() {}
+
+// Fills `buffer` so that element (row i, column j) holds its own row-major
+// index i * width_ + j.  A failed map is reported through CHECK_RESULT
+// instead of writing through a null pointer.
+void OCLPerfMatrixTranspose::setData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < height_; i++) {
+    for (unsigned int j = 0; j < width_; j++) {
+      *(data + i * width_ + j) = i * width_ + j;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Sets every one of the width_ * height_ elements of `buffer` to `val`.  A
+// failed map is reported through CHECK_RESULT.
+void OCLPerfMatrixTranspose::fillData(cl_mem buffer, unsigned int val) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < width_ * height_; i++) {
+    data[i] = val;
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Verifies that `buffer` holds the transpose of the pattern written by
+// setData(): element (row i, column j) must equal j * width_ + i.  The first
+// mismatch is printed and ends the scan.
+void OCLPerfMatrixTranspose::checkData(cl_mem buffer) {
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  bool err = false;
+  // Both loops stop through `!err`.  An unconditional break used to end the
+  // outer loop after its first pass, so only row 0 was ever verified.
+  for (unsigned int i = 0; (i < width_) && !err; i++) {
+    for (unsigned int j = 0; (j < height_) && !err; j++) {
+      if (*(data + i * height_ + j) != (j * width_ + i)) {
+        printf("Data mismatch at (%u, %u)! Got %u, expected %u\n", j, i,
+               *(data + i * height_ + j), j * width_ + i);
+        err = true;
+      }
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Context error callback; intentionally ignores every notification.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+void OCLPerfMatrixTranspose::open(unsigned int test, char *units,
+                                  double &conversion, unsigned int deviceId) {
+  cl_uint numPlatforms;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+
+  blockSize_ = blockSizes[_openTest % NUM_BLOCK_SIZES];
+  width_ = matrixDims[_openTest / NUM_BLOCK_SIZES];
+  height_ = width_;
+  // We compute a square domain
+  bufSize_ = width_ * height_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+#if 0
+        // Get last for default
+        platform = platforms[numPlatforms-1];
+        for (unsigned i = 0; i < numPlatforms; ++i) {
+#endif
+    platform = platforms[_platformIndex];
+    char pbuf[100];
+    error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
+                                         CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
+                                         NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
+                                      &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    // Choose
platform with GPU devices + if (num_devices > 0) { + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete[] platforms; /* allocated with new[], so it must be released with delete[] */ + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + free(devices); /* the handle was copied above; the malloc'd list is not used again */ + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_, + NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + setData(inBuffer_); + + outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_, + NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + fillData(outBuffer_, 0xdeadbeef); + + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&matrixtranspose_kernel, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + char *buildOps = NULL; + error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int 
intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "matrixTranspose", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg( + kernel_, 2, sizeof(cl_uint) * blockSize_ * blockSize_, NULL); + error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), + (void *)&blockSize_); + error_ = + _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&width_); + error_ = + _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_uint), (void *)&height_); +} + +void OCLPerfMatrixTranspose::run(void) { + size_t global_work_size[2] = {width_, height_}; + size_t local_work_size[2] = {blockSize_, blockSize_}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < MAX_ITERATIONS; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 2, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + } + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + checkData(outBuffer_); + // Compute GB/s + double perf = + ((double)bufSize_ * (double)MAX_ITERATIONS * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + testDescString = ""; + char str[64]; + sprintf(str, "(%d,%d) matrix with (%2d,%2d) block size %fms (GB/s) ", width_, + height_, blockSize_, blockSize_, + (sec / (double)MAX_ITERATIONS) * 1000.); + testDescString += str; +} + +unsigned int OCLPerfMatrixTranspose::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + error_ = 
_wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h new file mode 100644 index 0000000000..ac5c875162 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMatrixTranspose.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_MATRIX_TRANSPOSE_H_ +#define _OCL_MATRIX_TRANSPOSE_H_ + +#include "OCLTestImp.h" + +class OCLPerfMatrixTranspose : public OCLTestImp { + public: + OCLPerfMatrixTranspose(); + virtual ~OCLPerfMatrixTranspose(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void setData(cl_mem buffer); + void fillData(cl_mem buffer, unsigned int data); + void checkData(cl_mem buffer); + + cl_context context_; + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_mem inBuffer_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int width_; + unsigned int height_; + unsigned int bufSize_; + unsigned int blockSize_; + static const unsigned int MAX_ITERATIONS = 50; +}; + +#endif // _OCL_MATRIX_TRANSPOSE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp new file mode 100644 index 0000000000..057b9e3d25 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.cpp @@ -0,0 +1,234 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfMemCombine.h" + +#include +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +struct TestParams { + const char* type; + unsigned int numCombine; + unsigned int assignSize; +}; + +TestParams testParams[] + // char type causes shader compiler to crash. 
reenable once get a fix for + // the shader compiler + //= {{"char", 16}, {"short", 8}, {"int", 4}, {"long", 4}, {"float", 4}}; + //= {{"char", 16, 1}, {"short", 8, 2}, {"int", 4, 4}, {"long", 4, 8}, + = {{"short", 8, 2}, {"int", 4, 4}, {"long", 4, 8}, {"float", 4, 4}, + {"char4", 4, 4}, {"uchar16", 4, 16}, {"short2", 4, 4}, {"int2", 4, 8}, + {"uint4", 4, 16}, {"long2", 4, 16}, {"float2", 4, 8}}; + +const int numTests = sizeof(testParams) / sizeof(TestParams); + +// Generate a kernel that does array loads and stores, which should be combined +// by MemCombine +void genCombineVLoadVStores(const char* type, int loopSize, int numCombine, + char* ret) { + sprintf(ret, + "__kernel void combine_vload_vstores(__global %s" + " * restrict src, __global %s *result) {\n", + type, type); + strcat(ret, " int id = get_global_id(0);\n"); + strcat(ret, " int gsize = get_global_size(0);\n"); + char buf[256]; + sprintf(buf, " for (int i = 0; i < %d; i+=gsize) {\n", loopSize); + strcat(ret, buf); + sprintf(buf, " int j = (i+id) * %d;\n", numCombine); + strcat(ret, buf); + for (int i = 0; i < numCombine; ++i) { + sprintf(buf, " result[j+%d] = src[j+%d];\n", i, i); + strcat(ret, buf); + } + strcat(ret, " }\n}\n"); +} + +void OCLPerfMemCombine::setData(cl_mem buffer, unsigned int bufSize, + unsigned char val) { + unsigned char* data = (unsigned char*)_wrapper->clEnqueueMapBuffer( + cmdQueues_[0], buffer, true, CL_MAP_WRITE, 0, bufSize, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < bufSize; ++i) data[i] = val; + + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[0], buffer, data, 0, + NULL, NULL); + _wrapper->clFinish(cmdQueues_[0]); +} + +void print1Darray(unsigned char* buffer, unsigned int bufSize) { + for (unsigned int i = 0; i < bufSize; ++i) { + if (i % 32 == 0) printf("\n"); + printf("%d ", buffer[i]); + } + printf("\n"); +} + +void OCLPerfMemCombine::checkData(cl_mem buffer, unsigned int bufSize, + unsigned int limit, unsigned char defVal) { + unsigned char* data 
= (unsigned char*)_wrapper->clEnqueueMapBuffer( + cmdQueues_[0], buffer, true, CL_MAP_READ, 0, bufSize, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < bufSize; i++) { + unsigned char expected; + if (i < limit) { + expected = 1U; + } else { + expected = defVal; + } + if (data[i] != expected) { + printf("at index %d:\n", i); + print1Darray(&data[i], (bufSize - i < 16) ? (bufSize - i) : 16); /* never dump past the end of the mapping */ + CHECK_RESULT_NO_RETURN(1, "incorrect output data detected!"); /* non-returning variant so the unmap below still runs */ + break; + } + } + + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[0], buffer, data, 0, + NULL, NULL); + _wrapper->clFinish(cmdQueues_[0]); +} + +OCLPerfMemCombine::OCLPerfMemCombine() { _numSubTests = numTests; } + +OCLPerfMemCombine::~OCLPerfMemCombine() {} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLPerfMemCombine::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + _openTest = test; + + context_ = 0; + kernel_ = NULL; + program_ = NULL; + + OCLTestImp::open(test, units, conversion, deviceId); + + cl_mem inBuffer = + _wrapper->clCreateBuffer(context_, 0, inSize_, NULL, &error_); + CHECK_RESULT(inBuffer == 0, "clCreateBuffer(inBuffer) failed"); + buffers_.push_back(inBuffer); + + cl_mem outBuffer = + _wrapper->clCreateBuffer(context_, 0, outSize_, NULL, &error_); + CHECK_RESULT(outBuffer == 0, "clCreateBuffer(outBuffer) failed"); + buffers_.push_back(outBuffer); + + createKernel(testParams[test].type, testParams[test].numCombine); + setData(inBuffer, inSize_, 1U); + setData(outBuffer, outSize_, 0); + dataRange_ = loopSize_ * numCombine_ * testParams[test].assignSize; +} + +void OCLPerfMemCombine::createKernel(const char* type, int numCombine) { + dataType_ = type; + numCombine_ = numCombine; + + ///////////////////////////////////////////////////////////////// + // Load CL file, build CL program object, create CL kernel object + ///////////////////////////////////////////////////////////////// + char source[1024]; + 
genCombineVLoadVStores(type, loopSize_, numCombine, source); + size_t sourceSize[] = {strlen(source)}; + const char* src = &source[0]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &src, sourceSize, + &error_); + CHECK_RESULT(error_ != CL_SUCCESS, "clCreateProgramWithSource failed"); + + /* create a cl program executable for all the devices specified */ + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + return; + } + + /* get a kernel object handle for a kernel with the given name */ + const char* kernelName = "combine_vload_vstores"; + kernel_ = _wrapper->clCreateKernel(program_, kernelName, &error_); + CHECK_RESULT(error_ != CL_SUCCESS, "clCreateProgramWithSource failed"); + + /*** Set appropriate arguments to the kernel ***/ + /* the input array to the kernel */ + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void*)&buffers()[0]); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); + + /* the output array to the kernel */ + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), + (void*)&buffers()[1]); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); +} + +void OCLPerfMemCombine::run(void) { + size_t globalThreads[1]; + size_t localThreads[1]; + + globalThreads[0] = 64; + localThreads[0] = 64; + + CPerfCounter timer; + timer.Reset(); + timer.Start(); + + for (unsigned int i = 0; i < NUM_ITER; ++i) { + /* + * Enqueue a kernel run call. 
+ */ + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[0], kernel_, 1, NULL, + globalThreads, localThreads, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmdQueues_[0]); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + char buf[256]; + SNPRINTF(buf, sizeof(buf), "%d %-8s (sec)", numCombine_, dataType_); + testDescString = buf; + _perfInfo = (float)sec; + + checkData(buffers()[1], outSize_, dataRange_, 0); + return; +} + +unsigned int OCLPerfMemCombine::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h new file mode 100644 index 0000000000..6c7225ceca --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCombine.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_MemCombine_H_ +#define _OCL_MemCombine_H_ + +#include "OCLTestImp.h" + +class OCLPerfMemCombine : public OCLTestImp { + enum { inSize_ = 4096U * 1024U }; + enum { outSize_ = 4096U * 1024U }; + enum { loopSize_ = 8192 }; + + public: + OCLPerfMemCombine(); + virtual ~OCLPerfMemCombine(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 1000; + + const char* dataType_; + unsigned int numCombine_; + unsigned int dataRange_; + unsigned char input[inSize_]; + unsigned char output[outSize_]; + + private: + void createKernel(const char* type, int numCombine); + void setData(cl_mem buffer, unsigned int bufSize, unsigned char val); + void checkData(cl_mem buffer, unsigned int bufSize, unsigned int limit, + unsigned char defVal); +}; + +#endif // _OCL_MemCombine_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp new file mode 100644 index 0000000000..d58b8bf381 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.cpp @@ -0,0 +1,176 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfMemCreate.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +#if defined(_WIN32) && !defined(_WIN64) +static const size_t BufSize = 0x200000; +static const size_t BufSizeC = 0x100000; +#else +static const size_t BufSize = 0x400000; +static const size_t BufSizeC = 0x200000; +#endif + +static const size_t Iterations = 0x100; +static const size_t IterationsC = 0x1000; + +static const char* strKernel = + "__kernel void dummy(__global uint* out) \n" + "{ \n" + " uint id = get_global_id(0); \n" + " uint value = 1; \n" + " if ((int)get_local_id(0) < 0) \n" + " out[id] = value; \n" + "} \n"; + +#define NUM_TESTS 5 +OCLPerfMemCreate::OCLPerfMemCreate() { + _numSubTests = NUM_TESTS * 2; + failed_ = false; +} + +OCLPerfMemCreate::~OCLPerfMemCreate() {} + +void OCLPerfMemCreate::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + _deviceId = deviceId; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + test_ = test % NUM_TESTS; + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + useSubBuf_ = (test >= NUM_TESTS); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + program_ = 
_wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLPerfMemCreate::run(void) { + if (failed_) { + return; + } + cl_mem buffer, subBuf; + cl_mem* bufptr; + unsigned int* values; + values = reinterpret_cast<unsigned int*>(new cl_int4[BufSize]); /* cast target restored to match the declared type of values */ + CPerfCounter timer; + cl_mem_flags flags = CL_MEM_READ_ONLY; + void* hostPtr = NULL; + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + + size_t bufSize = ((test_ % 2) == 0) ? BufSize * sizeof(cl_int4) + : BufSizeC * sizeof(cl_int4); + size_t iter = ((test_ % 2) == 0) ? Iterations : IterationsC; + + if (test_ == 4) { + hostPtr = values; + bufSize = 0x100000; + flags = CL_MEM_USE_HOST_PTR; + } else if ((test_ / 2) > 0) { + iter = ((test_ % 2) == 0) ? 
Iterations / 10 : IterationsC; + flags |= CL_MEM_ALLOC_HOST_PTR; + } + timer.Reset(); + timer.Start(); + + for (size_t i = 0; i < iter; ++i) { + buffer = + _wrapper->clCreateBuffer(context_, flags, bufSize, hostPtr, &error_); + bufptr = &buffer; + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + if (useSubBuf_) { + cl_buffer_region reg; + reg.origin = 0; + reg.size = bufSize; + subBuf = _wrapper->clCreateSubBuffer( + buffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &reg, &error_); /* pass the region by address */ + bufptr = &subBuf; + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSubBuffer() failed"); + } + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), bufptr); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {64}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + if (useSubBuf_) _wrapper->clReleaseMemObject(subBuf); + _wrapper->clReleaseMemObject(buffer); + } + + timer.Stop(); + std::stringstream stream; + + static const char* Message[] = {" create+destroy time [uncached] ", + " create+destroy time [cached ] "}; + static const char* Type[] = {"DEV", "AHP", "UHP"}; + + stream << Type[test_ / 2]; + stream << Message[test_ % 2]; + stream << " per allocation (ms) "; + stream << bufSize / 1024 << " KB"; + if (useSubBuf_) stream << " subbuf "; + testDescString = stream.str(); + _perfInfo = static_cast<float>(timer.GetElapsedTime() * 1000 / iter); /* average milliseconds per create+destroy */ + + delete[] values; +} + +unsigned int OCLPerfMemCreate::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h new file mode 100644 index 0000000000..790b09a3a3 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemCreate.h @@ -0,0 +1,43 @@ +/* Copyright (c) 
2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_PERF_MEM_CREATE_H_ +#define _OCL_PERF_MEM_CREATE_H_ + +#include "OCLTestImp.h" + +class OCLPerfMemCreate : public OCLTestImp { + public: + OCLPerfMemCreate(); + virtual ~OCLPerfMemCreate(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int test_; + bool useSubBuf_; +}; + +#endif // _OCL_PERF_MEM_CREATE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp new file mode 100644 index 0000000000..d6d1c4828c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.cpp @@ -0,0 +1,418 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "OCLPerfMemLatency.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+static const unsigned int NUM_SIZES = 16;
+// 2k up to 64MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    2048,   4096,    8192,    16384,   32768,   65536,    131072,   262144,
+    524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864};
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Builds the OpenCL C source for the two kernels used by the test:
+//  - MemWalker: a dependent-load (pointer chasing) loop over `input`.
+//  - Overhead:  the same loop structure without the dependent loads; its run
+//               time is subtracted from MemWalker's in run().
+// MAKEVOLATILE / MAKERW / USE_FLOAT are supplied as -D build options by open().
+void OCLPerfMemLatency::genShader() {
+  shader_.clear();
+
+  // DO NOT PUBLISH
+  // Adopted from SiSoft Sandra 2013's memory latency test
+  shader_ +=
+      "#ifdef MAKEVOLATILE\n"
+      "#define VOLATILE volatile\n"
+      "#else\n"
+      "#define VOLATILE\n"
+      "#endif\n"
+      "__kernel\n"
+      //"__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void MemWalker(\n"
+      " global VOLATILE uint * restrict input,\n"
+      " __global uint * restrict output,\n"
+      " const uint uCount, const uint uSize,\n"
+      " const uint uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      " uint o = uOffset;\n"
+      " uint lid = 0;//get_local_id(0)*o;\n"
+      " uint x = lid;\n"
+      "\n"
+      " for (uint loop = 0; loop < repeats; loop++) {\n"
+      " uint i = uCount;\n"
+      " while (i--) {\n"
+      " x = input[x] /* + o*/;\n"
+      " }\n"
+      " }\n"
+      "\n"
+      "#ifdef MAKERW\n"
+      " input[0] = x;\n"
+      "#endif\n"
+      " output[0] = x;\n"
+      "}\n";
+
+  // printf("shader:\n%s\n", shader_.c_str());
+  shader_ += "\n\n";
+  shader_ +=
+      "__kernel\n"
+      //"__attribute__((work_group_size_hint(1, 1, 1)))\n"
+      "void Overhead(\n"
+      " __global uint * restrict input,\n"
+      " __global uint * restrict output,\n"
+      " const uint uCount, const uint uSize,\n"
+      " const uint uOffset, const int bMem, const uint repeats)\n"
+      "{\n"
+      "#ifdef USE_FLOAT\n"
+      " float x = (float)input[0];\n"
+      " for (uint loop = 0; loop < repeats; loop++) {\n"
+      " uint i = uCount;\n"
+      " x = (float)uOffset*x;\n"
+      " while (i--) {\n"
+      " x += (float)i;\n"
+      " }\n"
+      " }\n"
+      " output[0] = (uint)x;\n"
+      "#else\n"
+      " uint x = input[0];\n"
+      " for (uint loop = 0; loop < repeats; loop++) {\n"
+      " uint i = uCount;\n"
+      " x = x*uOffset;\n"
+      " while (i--) {\n"
+      " x += i;\n"
+      " }\n"
+      " }\n"
+      " output[0] = x;\n"
+      "#endif\n"
+      "}\n";
+}
+
+// Context error callback; intentionally ignores all notifications.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Six variants (thread count x plain/volatile/volatileRW) for every size.
+OCLPerfMemLatency::OCLPerfMemLatency() {
+  _numSubTests = NUM_SIZES * 6;
+  maxSize_ = Sizes[NUM_SIZES - 1];
+}
+
+OCLPerfMemLatency::~OCLPerfMemLatency() {}
+
+// Fills `buffer` with a pointer-chasing chain: element (i * 1041) % N holds
+// ((i + 1) * 1041) % N, where N = bufSizeDW_. Every size in Sizes[] is a power
+// of two, so unsigned wrap-around of i * 1041 does not change the value mod N.
+// `val` is unused; it is kept so the declared interface stays the same.
+void OCLPerfMemLatency::setData(cl_mem buffer, unsigned int val) {
+  (void)val;
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_WRITE, 0,
+                                   width_, 0, NULL, NULL, &error_);
+  // Do not write through a failed mapping.
+  CHECK_RESULT(ptr == NULL, "clEnqueueMapBuffer failed");
+  unsigned int *data = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < bufSizeDW_; i++) {
+    data[(i * (1024 + 17)) % bufSizeDW_] = ((i + 1) * (1024 + 17)) % bufSizeDW_;
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+  // Go through the wrapper like every other OpenCL call in this file.
+  _wrapper->clFinish(cmd_queue_);
+}
+
+// Reads back the first word of `buffer` and flags an error when it is not 0.
+void OCLPerfMemLatency::checkData(cl_mem buffer) {
+  void *ptr =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0,
+                                   sizeof(cl_uint), 0, NULL, NULL, &error_);
+  // Do not read through a failed mapping.
+  CHECK_RESULT(ptr == NULL, "clEnqueueMapBuffer failed");
+
+  unsigned int *data = (unsigned int *)ptr;
+  if (data[0] != 0) {
+    printf("OutData= 0x%08x\n", data[0]);
+    CHECK_RESULT_NO_RETURN(data[0] != 0, "Data validation failed!\n");
+  }
+  error_ =
+      _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL);
+}
+
+// Prepares sub-test `test`: selects the platform/device, decodes the sub-test
+// index into buffer size and variant flags, creates context, queue, buffers
+// and both kernels, binds the kernel arguments and fills the input buffer.
+void OCLPerfMemLatency::open(unsigned int test, char *units, double &conversion,
+                             unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  moreThreads = false;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  kernel2_ = 0;  // close() tests this handle, so it must be reset as well
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  _errorFlag = false;  // Reset error code so a single error doesn't prevent
+                       // other subtests from running
+  _errorMsg = "";
+  isAMD_ = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    // std::vector releases the array on every exit path, including the
+    // early returns performed by CHECK_RESULT.
+    std::vector<cl_platform_id> platforms(numPlatforms);
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    CHECK_RESULT(_platformIndex >= numPlatforms,
+                 "Requested platform index not available");
+
+    platform = platforms[_platformIndex];
+    char pbuf[100] = "";  // stays empty if the vendor query fails
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD_ = true;
+    }
+  }
+
+  // Sub-test layout: size = test % NUM_SIZES; odd blocks of NUM_SIZES use more
+  // threads; blocks 2+ are volatile, blocks 4+ additionally write the input.
+  width_ = Sizes[test % NUM_SIZES];
+
+  bufSizeDW_ = width_ / sizeof(cl_uint);
+  moreThreads = ((test / NUM_SIZES) % 2) ? true : false;
+  makeVolatile = (test >= 2 * NUM_SIZES) ? true : false;
+  makeRW = (test >= 4 * NUM_SIZES) ? true : false;
+
+  CHECK_RESULT(platform == 0, "Couldn't find OpenCL platform, cannot proceed");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "Failed to allocate devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS) {
+    // NOTE(review): always uses the first device; _deviceId is not consulted.
+    device = devices[0];
+  }
+  // Free before the check so a failure does not leak the array.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, width_, NULL, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, 0, 1 * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader();
+  const char *source = shader_.c_str();
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  std::string args;
+  if (isAMD_) args += " -D USE_FLOAT";
+  if (makeVolatile) args += " -D MAKEVOLATILE";
+  if (makeRW) args += " -D MAKERW";
+
+  error_ =
+      _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    log[0] = '\0';  // print an empty log if the query below fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // CHECK_RESULT fires when its condition is true; the previous constant 0
+    // meant a failed build was never reported here.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "MemWalker", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel(MemWalker) failed");
+
+  kernel2_ = _wrapper->clCreateKernel(program_, "Overhead", &error_);
+  CHECK_RESULT(kernel2_ == 0, "clCreateKernel(Overhead) failed");
+
+  unsigned int zero = 0;
+  int bMem = 1;
+  // Limit the repeats, large buffers will have more samples, but the test runs
+  // for a long time
+  repeats_ = std::max((maxSize_ >> 4) / bufSizeDW_, 1u);
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ = _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&zero);
+  error_ = _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_int), (void *)&bMem);
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&repeats_);
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void *)&inBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 1, sizeof(cl_mem),
+                                    (void *)&outBuffer_);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 3, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 4, sizeof(cl_uint), (void *)&zero);
+  error_ = _wrapper->clSetKernelArg(kernel2_, 5, sizeof(cl_int), (void *)&bMem);
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 6, sizeof(cl_uint), (void *)&repeats_);
+
+  setData(inBuffer_, 1);
+}
+
+// Runs a short warm-up, then times MemWalker and Overhead separately and
+// reports (MemWalker - Overhead) per read, scaled by 1e9, in _perfInfo.
+void OCLPerfMemLatency::run(void) {
+  int global = 1;
+  int local = 1;
+
+  if (moreThreads) {
+    if (isAMD_) {
+      global *= 64;
+      local *= 64;
+    } else {
+      global *= 32;
+      local *= 32;
+    }
+  }
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  // Warm-up
+  unsigned int warmup = 128;
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), (void *)&warmup);
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&bufSizeDW_);
+  _wrapper->clFinish(cmd_queue_);
+
+  // Restore input buffer when finished as it may have been modified by RW test
+  setData(inBuffer_, 1);
+
+  CPerfCounter timer, timer2;
+
+  timer.Reset();
+  timer.Start();
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+
+  checkData(outBuffer_);
+
+  timer2.Reset();
+  timer2.Start();
+
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmd_queue_, kernel2_, 1, NULL, (const size_t *)global_work_size,
+      (const size_t *)local_work_size, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+
+  _wrapper->clFinish(cmd_queue_);
+
+  timer2.Stop();
+  double sec = timer.GetElapsedTime() - timer2.GetElapsedTime();
+
+  // Read latency in ns
+  double perf = sec * (double)(1e09) / ((double)bufSizeDW_ * (double)repeats_);
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  char buf2[32];
+  // Each buffer is bounded by its own size (buf2 was bounded by sizeof(buf)).
+  if (makeRW)
+    SNPRINTF(buf2, sizeof(buf2), "volatileRW");
+  else if (makeVolatile)
+    SNPRINTF(buf2, sizeof(buf2), "volatile");
+  else
+    buf2[0] = '\0';
+  // bufSizeDW_ and repeats_ are unsigned, hence %u.
+  SNPRINTF(buf, sizeof(buf), "%10s %2d threads, %8u reads, %5u repeats (ns)",
+           buf2, global, bufSizeDW_, repeats_);
+  testDescString = buf;
+}
+
+// Releases every OpenCL object created by open() and returns _crcword.
+unsigned int OCLPerfMemLatency::close(void) {
+  // The queue is null when open() bailed out before creating it.
+  if (cmd_queue_) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h
new file mode 100644
index 0000000000..0e2f0f4e98
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfMemLatency.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_MEMLATENCY_H_
+#define _OCL_MEMLATENCY_H_
+
+// std::string is used for shader_ below; include it here rather than relying
+// on OCLTestImp.h to provide it.
+#include <string>
+
+#include "OCLTestImp.h"
+
+// Memory read-latency test: times a pointer-chasing kernel over buffers of
+// several sizes and subtracts the time of an equivalent loop without the
+// dependent loads.
+class OCLPerfMemLatency : public OCLTestImp {
+ public:
+  OCLPerfMemLatency();
+  virtual ~OCLPerfMemLatency();
+
+ public:
+  // Creates all OpenCL objects for sub-test `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Executes and times the kernels; stores the result in _perfInfo.
+  virtual void run(void);
+  // Releases the OpenCL objects created by open().
+  virtual unsigned int close(void);
+
+  std::string shader_;  // OpenCL C source produced by genShader()
+  void genShader(void);
+  // Writes the pointer-chasing pattern into `buffer` (`data` is unused).
+  void setData(cl_mem buffer, unsigned int data);
+  // Flags an error unless the first word of `buffer` is 0.
+  void checkData(cl_mem buffer);
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;   // "MemWalker"
+  cl_kernel kernel2_;  // "Overhead"
+  cl_mem inBuffer_;    // chain buffer, width_ bytes
+  cl_mem outBuffer_;   // single cl_uint result
+  cl_int error_;       // status of the most recent OpenCL call
+
+  unsigned int width_;      // input buffer size in bytes
+  unsigned int bufSizeDW_;  // input buffer size in cl_uint elements
+  unsigned int repeats_;    // outer-loop count passed to both kernels
+  unsigned int maxSize_;    // largest tested size in bytes
+  bool isAMD_;              // platform vendor is AMD and it exposes devices
+  bool moreThreads;         // launch 64 (AMD) or 32 work-items instead of 1
+  bool makeVolatile;        // build with -D MAKEVOLATILE
+  bool makeRW;              // build with -D MAKERW
+};
+
+#endif  // _OCL_MEMLATENCY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp
new file mode 100644
index 0000000000..f29724fc8b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.cpp
@@ -0,0 +1,347 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "OCLPerfPinnedBufferReadSpeed.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cmath>
+#include <vector>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 8
+// 1KB, 4KB, 8KB, 16KB, 256KB, 1MB, 4MB, 16MB
+static const unsigned int Sizes[NUM_SIZES] = {
+    1024, 4 * 1024, 8 * 1024, 16 * 1024, 262144, 1048576, 4194304, 16777216};
+
+static const unsigned int Iterations[2] = {
+    1, OCLPerfPinnedBufferReadSpeed::NUM_ITER};
+#define NUM_OFFSETS 2
+static const unsigned int offsets[NUM_OFFSETS] = {0, 16};
+#define NUM_SUBTESTS (1 + NUM_OFFSETS)
+
+// Sub-tests below this index use blocking reads; the rest are non-blocking.
+static cl_uint blockedSubtests;
+
+OCLPerfPinnedBufferReadSpeed::OCLPerfPinnedBufferReadSpeed() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2;
+  blockedSubtests = _numSubTests;
+  _numSubTests += NUM_SIZES * NUM_SUBTESTS;
+}
+
+OCLPerfPinnedBufferReadSpeed::~OCLPerfPinnedBufferReadSpeed() {}
+
+// Context error callback; intentionally ignores all notifications.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Indexed by the cl_bool `blocking` flag. Has external linkage because
+// OCLPerfPinnedBufferWriteSpeed.cpp declares it `extern`.
+const char *blkStr[2] = {"n/b", "blk"};
+
+// Prepares sub-test `test`: selects platform/device, decodes the sub-test
+// index into buffer size, host-memory mode and iteration count, then creates
+// the context, queue and the two buffers.
+void OCLPerfPinnedBufferReadSpeed::open(unsigned int test, char *units,
+                                        double &conversion,
+                                        unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  persistent = false;
+  allocHostPtr = false;
+  useHostPtr = false;
+  hostMem = NULL;
+  alignedMem = NULL;
+  alignment = 4096;
+  offset = 0;  // only meaningful for USE_HOST_PTR sub-tests
+  isAMD = false;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    // std::vector releases the array on every exit path, including the
+    // early returns performed by CHECK_RESULT.
+    std::vector<cl_platform_id> platforms(numPlatforms);
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+    CHECK_RESULT(_platformIndex >= numPlatforms,
+                 "Requested platform index not available");
+
+    platform = platforms[_platformIndex];
+    char pbuf[100] = "";  // stays empty if the vendor query fails
+    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                         sizeof(pbuf), pbuf, NULL);
+    num_devices = 0;
+    /* Get the number of requested devices */
+    error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
+    // Runtime returns an error when no GPU devices are present instead of just
+    // returning 0 devices
+    // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+    if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+      isAMD = true;
+    }
+  }
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+
+  char getVersion[128];
+  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+                                       sizeof(getVersion), getVersion, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
+  // Characters 7..9 of the version string, e.g. "1.2" in "OpenCL 1.2 ...".
+  platformVersion[0] = getVersion[7];
+  platformVersion[1] = getVersion[8];
+  platformVersion[2] = getVersion[9];
+  platformVersion[3] = '\0';
+  bufSize_ = Sizes[_openTest % NUM_SIZES];
+
+  // Mode 0 = ALLOC_HOST_PTR, modes 1.. = USE_HOST_PTR with offsets[mode - 1].
+  const unsigned int mode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
+  if (mode > 0) {
+    useHostPtr = true;
+    offset = offsets[mode - 1];
+  } else {
+    allocHostPtr = true;
+  }
+
+  if (_openTest < blockedSubtests) {
+    numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
+  } else {
+    numIter = 4 * OCLPerfPinnedBufferReadSpeed::NUM_ITER /
+              ((_openTest % NUM_SIZES) + 1);
+  }
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  // The array is only needed to pick the device; free it before the checks
+  // so that neither the success path nor a failure leaks it.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  if (allocHostPtr) {
+    flags |= CL_MEM_ALLOC_HOST_PTR;
+  } else if (useHostPtr) {
+    flags |= CL_MEM_USE_HOST_PTR;
+    // Over-allocate so the pointer can be rounded up to `alignment` and then
+    // shifted by `offset` while still leaving bufSize_ usable bytes.
+    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
+    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
+    alignedMem =
+        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
+                 offset);
+  }
+  inBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, 0, &error_);
+  CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
+  outBuffer_ =
+      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  // Force memory to be on GPU if possible
+  {
+    cl_mem memBuffer =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, inBuffer_, 0, 0,
+                                  bufSize_, 0, NULL, NULL);
+    _wrapper->clFinish(cmd_queue_);
+
+    _wrapper->clReleaseMemObject(memBuffer);
+  }
+}
+
+// Maps outBuffer_ and times numIter clEnqueueReadBuffer calls from inBuffer_
+// into the mapped pointer; stores bufSize_ * numIter * 1e-9 / elapsed in
+// _perfInfo.
+void OCLPerfPinnedBufferReadSpeed::run(void) {
+  CPerfCounter timer;
+  void *mem =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, inBuffer_, CL_TRUE, 0,
+                                         bufSize_, mem, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < numIter; i++) {
+    error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, inBuffer_, blocking, 0,
+                                           bufSize_, mem, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
+  }
+
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char str[256];
+  str[0] = '\0';  // never print an uninitialised buffer
+  if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  }
+  char buf[256];
+  // bufSize_ and numIter are unsigned, hence %u.
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %3s i: %4u %31s ", bufSize_,
+           blkStr[blocking], numIter, str);
+  testDescString = buf;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
+
+// Releases the OpenCL objects and host memory created by open() and returns
+// _crcword.
+unsigned int OCLPerfPinnedBufferReadSpeed::close(void) {
+  // The queue is null when open() bailed out before creating it.
+  if (cmd_queue_) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+  if (inBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(inBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(inBuffer_) failed");
+  }
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  free(hostMem);
+  hostMem = NULL;
+  alignedMem = NULL;
+
+  return _crcword;
+}
+
+// Rectangular variant: reads a width x width byte region, where width is the
+// truncated square root of bufSize_, using clEnqueueReadBufferRect. At most
+// 100 timed iterations are run.
+// NOTE(review): the reported bandwidth still uses bufSize_, which is slightly
+// larger than width * width when bufSize_ is not a perfect square.
+void OCLPerfPinnedBufferReadRectSpeed::run(void) {
+  // Skip for 1.0 platforms. Checked before mapping so that the early return
+  // does not leave outBuffer_ mapped.
+  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
+    testDescString = " SKIPPED ";
+    return;
+  }
+
+  CPerfCounter timer;
+  void *mem =
+      _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, CL_MAP_READ,
+                                   0, bufSize_, 0, NULL, NULL, &error_);
+  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+  size_t width =
+      static_cast<size_t>(std::sqrt(static_cast<double>(bufSize_)));
+  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;
+  size_t bufOrigin[3] = {0, 0, 0};
+  size_t hostOrigin[3] = {0, 0, 0};
+  size_t region[3] = {width, width, 1};
+  // Clamp iteration count to reduce test run time
+  unsigned int testNumIter = (numIter < 100 ? numIter : 100);
+
+  // Warm up
+  error_ = _wrapper->clEnqueueReadBufferRect(
+      cmd_queue_, inBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
+      width, 0, mem, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < testNumIter; i++) {
+    error_ = _wrapper->clEnqueueReadBufferRect(
+        cmd_queue_, inBuffer_, blocking, bufOrigin, hostOrigin, region, width,
+        0, width, 0, mem, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
+  }
+
+  if (blocking != CL_TRUE) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Buffer read bandwidth in GB/s
+  double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  char str[256];
+  str[0] = '\0';  // never print an uninitialised buffer
+  if (allocHostPtr) {
+    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
+  } else if (useHostPtr) {
+    SNPRINTF(str, sizeof(str), "off: %4u USE_HOST_PTR (GB/s)", offset);
+  }
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8u bytes) %3s i: %4u %31s ", bufSize_,
+           blkStr[blocking], testNumIter, str);
+  testDescString = buf;
+
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
+                                             NULL, NULL);
+  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h
new file mode 100644
index 0000000000..1999cac84e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferReadSpeed.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/
+
+#ifndef _OCL_PinnedBufferReadSpeed_H_
+#define _OCL_PinnedBufferReadSpeed_H_
+
+#include "OCLTestImp.h"
+
+// Measures clEnqueueReadBuffer throughput from a device buffer into a mapped
+// host-visible buffer created with ALLOC_HOST_PTR or USE_HOST_PTR.
+class OCLPerfPinnedBufferReadSpeed : public OCLTestImp {
+ public:
+  OCLPerfPinnedBufferReadSpeed();
+  virtual ~OCLPerfPinnedBufferReadSpeed();
+
+  // Test life cycle: open() creates the OpenCL objects for sub-test `test`,
+  // run() performs the timed reads, close() releases everything.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run();
+  virtual unsigned int close();
+
+  // Iteration count used by the second group of blocking sub-tests and as the
+  // base for the non-blocking iteration count.
+  static const unsigned int NUM_ITER = 1000;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem inBuffer_;   // source of the timed reads
+  cl_mem outBuffer_;  // host-visible buffer that gets mapped as destination
+  cl_int error_;      // status of the most recent OpenCL call
+
+  unsigned int bufSize_;  // transfer size in bytes
+  bool persistent;        // reset to false in open()
+  bool allocHostPtr;      // outBuffer_ uses CL_MEM_ALLOC_HOST_PTR
+  bool useHostPtr;        // outBuffer_ uses CL_MEM_USE_HOST_PTR
+  unsigned int numIter;   // number of timed reads
+  char* hostMem;          // raw malloc'ed block backing USE_HOST_PTR
+  char* alignedMem;       // hostMem rounded up to `alignment`, plus `offset`
+  size_t alignment;       // alignment in bytes (4096)
+  unsigned int offset;    // byte offset added after alignment
+  bool isAMD;             // platform vendor is AMD and it exposes devices
+  char platformVersion[32];  // "major.minor" taken from CL_PLATFORM_VERSION
+};
+
+// Same setup as the base test, but run() times clEnqueueReadBufferRect.
+class OCLPerfPinnedBufferReadRectSpeed : public OCLPerfPinnedBufferReadSpeed {
+ public:
+  OCLPerfPinnedBufferReadRectSpeed() {}
+
+  virtual void run();
+};
+
+#endif  // _OCL_PinnedBufferReadSpeed_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp
new file mode 100644
index 0000000000..2fccd41163
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.cpp
@@ -0,0 +1,342 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfPinnedBufferWriteSpeed.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 8 +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = { + 1024, 4 * 1024, 8 * 1024, 16 * 1024, 262144, 1048576, 4194304, 16777216}; + +static cl_uint blockedSubtests; + +static const unsigned int Iterations[2] = { + 1, OCLPerfPinnedBufferWriteSpeed::NUM_ITER}; +#define NUM_OFFSETS 2 +static const unsigned int offsets[NUM_OFFSETS] = {0, 16}; +#define NUM_SUBTESTS (1 + NUM_OFFSETS) +OCLPerfPinnedBufferWriteSpeed::OCLPerfPinnedBufferWriteSpeed() { + _numSubTests = NUM_SIZES * NUM_SUBTESTS * 2; + blockedSubtests = _numSubTests; + _numSubTests += NUM_SIZES * NUM_SUBTESTS; +} + +OCLPerfPinnedBufferWriteSpeed::~OCLPerfPinnedBufferWriteSpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +extern const char *blkStr[2]; + +void OCLPerfPinnedBufferWriteSpeed::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + persistent = false; + allocHostPtr = false; + useHostPtr = false; + hostMem = NULL; + alignedMem = NULL; + alignment = 4096; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs 
failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + char getVersion[128]; + error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION, + sizeof(getVersion), getVersion, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + platformVersion[0] = getVersion[7]; + platformVersion[1] = getVersion[8]; + platformVersion[2] = getVersion[9]; + platformVersion[3] = '\0'; + bufSize_ = Sizes[_openTest % NUM_SIZES]; + + if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 0) { + useHostPtr = true; + offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 1]; + } else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 0) { + allocHostPtr = true; + } + + if (_openTest < blockedSubtests) { + numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)]; + } else { + numIter = 4 * OCLPerfPinnedBufferWriteSpeed::NUM_ITER / + ((_openTest % NUM_SIZES) + 1); + } + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + 
CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_mem_flags flags = CL_MEM_READ_ONLY; + if (allocHostPtr) { + flags |= CL_MEM_ALLOC_HOST_PTR; + } else if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset); + CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed"); + alignedMem = + (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) + + offset); + } + inBuffer_ = + _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, 0, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + // Force memory to be on GPU if possible + { + cl_mem memBuffer = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed"); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, inBuffer_, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0, + bufSize_, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); + + _wrapper->clReleaseMemObject(memBuffer); + } +} + +void OCLPerfPinnedBufferWriteSpeed::run(void) { + CPerfCounter timer; + void *mem = + _wrapper->clEnqueueMapBuffer(cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_WRITE, + 
0, bufSize_, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE; + + // Warm up + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0, + bufSize_, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueWriteBuffer failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, blocking, 0, + bufSize_, mem, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueWriteBuffer failed"); + } + if (blocking != CL_TRUE) { + _wrapper->clFinish(cmd_queue_); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer read bandwidth in GB/s + double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char str[256]; + if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %31s ", bufSize_, + blkStr[blocking], numIter, str); + testDescString = buf; + + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed"); +} + +unsigned int OCLPerfPinnedBufferWriteSpeed::close(void) { + _wrapper->clFinish(cmd_queue_); + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = 
_wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (hostMem) { + free(hostMem); + } + + return _crcword; +} + +void OCLPerfPinnedBufferWriteRectSpeed::run(void) { + CPerfCounter timer; + void *mem = + _wrapper->clEnqueueMapBuffer(cmd_queue_, inBuffer_, CL_TRUE, CL_MAP_READ, + 0, bufSize_, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + size_t width = static_cast(sqrt(static_cast(bufSize_))); + size_t bufOrigin[3] = {0, 0, 0}; + size_t hostOrigin[3] = {0, 0, 0}; + size_t region[3] = {width, width, 1}; + // Clamp iteration count to reduce test run time + unsigned int testNumIter; + testNumIter = (numIter < 100 ? numIter : 100); + cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE; + + // Skip for 1.0 platforms + if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) { + testDescString = " SKIPPED "; + return; + } + // Warm up + error_ = _wrapper->clEnqueueWriteBufferRect( + cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0, + width, 0, mem, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBufferRect failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < testNumIter; i++) { + error_ = _wrapper->clEnqueueWriteBufferRect( + cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width, + 0, width, 0, mem, 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed"); + } + if (blocking != CL_TRUE) { + _wrapper->clFinish(cmd_queue_); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Buffer read bandwidth in GB/s + double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char str[256]; + if (allocHostPtr) { + SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)"); + } else if (useHostPtr) { + SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset); + } + char buf[256]; + SNPRINTF(buf, sizeof(buf), " (%8d 
bytes) %3s i: %4d %31s ", bufSize_, + blkStr[blocking], testNumIter, str); + testDescString = buf; + + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, inBuffer_, mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed"); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h new file mode 100644 index 0000000000..40b2620053 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPinnedBufferWriteSpeed.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PinnedBufferWriteSpeed_H_ +#define _OCL_PinnedBufferWriteSpeed_H_ + +#include "OCLTestImp.h" + +class OCLPerfPinnedBufferWriteSpeed : public OCLTestImp { + public: + OCLPerfPinnedBufferWriteSpeed(); + virtual ~OCLPerfPinnedBufferWriteSpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 1000; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_mem inBuffer_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int bufSize_; + bool persistent; + bool allocHostPtr; + bool useHostPtr; + unsigned int numIter; + char* hostMem; + char* alignedMem; + size_t alignment; + unsigned int offset; + bool isAMD; + char platformVersion[32]; +}; + +class OCLPerfPinnedBufferWriteRectSpeed : public OCLPerfPinnedBufferWriteSpeed { + public: + OCLPerfPinnedBufferWriteRectSpeed() : OCLPerfPinnedBufferWriteSpeed() {} + + public: + virtual void run(void); +}; + +#endif // _OCL_PinnedBufferWriteSpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp new file mode 100644 index 0000000000..3c4bfd66a3 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.cpp @@ -0,0 +1,504 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfPipeCopySpeed.h" + +#include +#include +#include + +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define KERNEL_CODE(...) 
#__VA_ARGS__ + +const static char * strKernel = +{ + KERNEL_CODE( + \n + kernel void initPipe(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n + {\n + int gid = get_global_id(0);\n + write_pipe(outPipe, &inBuf[gid]);\n + }\n + \n + kernel void copyPipe(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n + {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, &tmp);\n + write_pipe(outPipe, &tmp);\n + }\n + \n + kernel void readPipe(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n + {\n + int gid = get_global_id(0);\n + DATA_TYPE tmp;\n + read_pipe(inPipe, &tmp);\n + outBuf[gid] = tmp;\n + }\n + \n + kernel void initPipe_reserve(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n + {\n + int gid = get_global_id(0);\n + local reserve_id_t resId;\n + resId = reserve_write_pipe(outPipe, 1);\n + if (is_valid_reserve_id(resId)) {\n + write_pipe(outPipe, resId, 0, &inBuf[gid]);\n + commit_write_pipe(outPipe, resId);\n + }\n + }\n + \n + kernel void copyPipe_reserve(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n + {\n + local reserve_id_t resId;\n + resId = reserve_read_pipe(inPipe, 1);\n + if (is_valid_reserve_id(resId)) {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, resId, 0, &tmp);\n + commit_read_pipe(inPipe, resId);\n + resId = reserve_write_pipe(outPipe, 1);\n + if (is_valid_reserve_id(resId)) {\n + write_pipe(outPipe, resId, 0, &tmp);\n + commit_write_pipe(outPipe, resId);\n + }\n + }\n + }\n + \n + kernel void readPipe_reserve(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n + {\n + int gid = get_global_id(0);\n + local reserve_id_t resId;\n + resId = reserve_read_pipe(inPipe, 1);\n + if (is_valid_reserve_id(resId)) {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, resId, 0, &tmp);\n + commit_read_pipe(inPipe, resId);\n + outBuf[gid] = tmp;\n + }\n + }\n + \n + kernel void initPipe_wg(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n + {\n + int gid = get_global_id(0);\n + local 
reserve_id_t resId;\n + resId = work_group_reserve_write_pipe(outPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + write_pipe(outPipe, resId, get_local_id(0), &inBuf[gid]);\n + work_group_commit_write_pipe(outPipe, resId);\n + }\n + }\n + \n + kernel void copyPipe_wg(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n + {\n + local reserve_id_t resId;\n + resId = work_group_reserve_read_pipe(inPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, resId, get_local_id(0), &tmp);\n + work_group_commit_read_pipe(inPipe, resId);\n + resId = work_group_reserve_write_pipe(outPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + write_pipe(outPipe, resId, get_local_id(0), &tmp);\n + work_group_commit_write_pipe(outPipe, resId);\n + }\n + }\n + }\n + \n + kernel void readPipe_wg(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n + {\n + int gid = get_global_id(0);\n + local reserve_id_t resId;\n + resId = work_group_reserve_read_pipe(inPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, resId, get_local_id(0), &tmp);\n + work_group_commit_read_pipe(inPipe, resId);\n + outBuf[gid] = tmp;\n + }\n + }\n + \n +\x23 ifdef SUBGROUPS\n + \x23 pragma OPENCL EXTENSION cl_khr_subgroups : enable\n + kernel __attribute__((reqd_work_group_size(64,1,1))) void initPipe_sg(global DATA_TYPE* inBuf, write_only pipe DATA_TYPE outPipe)\n + {\n + int gid = get_global_id(0);\n + local reserve_id_t resId;\n + resId = sub_group_reserve_write_pipe(outPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + write_pipe(outPipe, resId, get_local_id(0), &inBuf[gid]);\n + sub_group_commit_write_pipe(outPipe, resId);\n + }\n + }\n + \n + kernel __attribute__((reqd_work_group_size(64,1,1))) void copyPipe_sg(read_only pipe DATA_TYPE inPipe, write_only pipe DATA_TYPE outPipe)\n + {\n + local reserve_id_t resId;\n + resId = 
sub_group_reserve_read_pipe(inPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, resId, get_local_id(0), &tmp);\n + sub_group_commit_read_pipe(inPipe, resId);\n + resId = sub_group_reserve_write_pipe(outPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + write_pipe(outPipe, resId, get_local_id(0), &tmp);\n + sub_group_commit_write_pipe(outPipe, resId);\n + }\n + }\n + }\n + \n + kernel __attribute__((reqd_work_group_size(64,1,1))) void readPipe_sg(read_only pipe DATA_TYPE inPipe, global DATA_TYPE* outBuf)\n + {\n + int gid = get_global_id(0);\n + local reserve_id_t resId;\n + resId = sub_group_reserve_read_pipe(inPipe, get_local_size(0));\n + if (is_valid_reserve_id(resId)) {\n + DATA_TYPE tmp;\n + read_pipe(inPipe, resId, get_local_id(0), &tmp);\n + sub_group_commit_read_pipe(inPipe, resId);\n + outBuf[gid] = tmp;\n + }\n + }\n +\x23 endif\n + \n + ) +}; + +#define NUM_SIZES 6 +// 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB +static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, + 262144, 1048576, 4194304}; + +#define NUM_TYPES 3 +static const char *types[NUM_TYPES] = {"int", "int4", "int16"}; +static const unsigned int typeSize[NUM_TYPES] = {4, 16, 64}; + +#define NUM_TESTS 4 + +OCLPerfPipeCopySpeed::OCLPerfPipeCopySpeed() { + _numSubTests = NUM_TESTS * NUM_SIZES * NUM_TYPES; +} + +OCLPerfPipeCopySpeed::~OCLPerfPipeCopySpeed() {} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfPipeCopySpeed::setData(cl_mem buffer) { + int *mem; + int dwTypeSize = (int)(typeSize[typeIdx_]) >> 2; + mem = (int *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, CL_TRUE, + CL_MAP_WRITE, 0, bufSize_, 0, NULL, + NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + for (int i = 0; i < (int)numElements; i++) { + for (int j = 0; j < dwTypeSize; j++) { + mem[i * dwTypeSize + j] = i; + } + } + error_ = 
_wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, (void *)mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed"); + clFinish(cmd_queue_); +} + +void OCLPerfPipeCopySpeed::checkData(cl_mem buffer) { + int *mem; + int dwTypeSize = (int)(typeSize[typeIdx_]) >> 2; + char *histo; + histo = (char *)malloc(numElements * sizeof(char)); + memset(histo, 0, sizeof(char) * numElements); + mem = (int *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, CL_TRUE, + CL_MAP_READ, 0, bufSize_, 0, NULL, + NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + int errCnt = 0; + for (int i = 0; (i < (int)numElements) && (errCnt < 5); i++) { + int tmp = mem[dwTypeSize * i]; + for (int j = 1; (j < dwTypeSize) && (errCnt < 5); j++) { + if (mem[i * dwTypeSize + j] != tmp) { + // BAD DATA! + printf("BAD DATA at element %d, ref %d, got %d\n", i, tmp, + mem[i * dwTypeSize + j]); + errCnt++; + } + } + if (histo[tmp] == 1) { + printf("BAD DATA at element %d, val %d already found!\n", i, tmp); + errCnt++; + } + histo[tmp] = 1; + } + errCnt = 0; + for (int i = 0; (i < (int)numElements) && (errCnt < 5); i++) { + if (histo[i] != 1) { + printf("BAD DATA at element %d, val not found!\n", i); + errCnt++; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, (void *)mem, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed"); + clFinish(cmd_queue_); + free(histo); +} + +void OCLPerfPipeCopySpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + _crcword = 0; + conversion = 1.0f; + + cl_device_id device = devices_[deviceId]; + cmd_queue_ = cmdQueues_[_deviceId]; + + program_ = 0; + initPipe_ = 0; + copyPipe_ = 0; + readPipe_ = 0; + srcBuffer_ = 0; + dstBuffer_ = 0; + pipe_[0] = 0; + pipe_[1] = 0; + failed_ = false; + subgroupSupport_ = false; + + bufSize_ = Sizes[test % 
NUM_SIZES]; + typeIdx_ = (test / NUM_SIZES) % NUM_TYPES; + testIdx_ = test / (NUM_SIZES * NUM_TYPES); + + numIter = NUM_ITER; + + char getVersion[128]; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_VERSION, + sizeof(getVersion), getVersion, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (getVersion[7] < '2') { + failed_ = true; + _errorMsg = "OpenCL 2.0 not supported"; + return; + } + + srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_, + NULL, &error_); + CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed"); + + numElements = bufSize_ / typeSize[typeIdx_]; + char args[100]; + +#if defined(CL_VERSION_2_0) + pipe_[0] = + _wrapper->clCreatePipe(context_, CL_MEM_HOST_NO_ACCESS, + typeSize[typeIdx_], numElements, NULL, &error_); + CHECK_RESULT(pipe_[0] == 0, "clCreatePipe(pipe_[0]) failed"); + + pipe_[1] = + _wrapper->clCreatePipe(context_, CL_MEM_HOST_NO_ACCESS, + typeSize[typeIdx_], numElements, NULL, &error_); + CHECK_RESULT(pipe_[1] == 0, "clCreatePipe(pipe_[1]) failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + char *p = strstr(charbuf, "cl_khr_subgroups"); + if (p) { + subgroupSupport_ = true; + SNPRINTF(args, sizeof(args), "-cl-std=CL2.0 -D DATA_TYPE=%s -D SUBGROUPS", + types[typeIdx_]); + } else { + if (test >= (NUM_SIZES * NUM_TYPES * 3)) { + // No support for subgroups, so skip these tests + failed_ = true; + _errorMsg = "Subgroup extension not supported"; + return; + } + SNPRINTF(args, sizeof(args), "-cl-std=CL2.0 -D DATA_TYPE=%s", + types[typeIdx_]); + } +#endif + + dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_, + NULL, &error_); + CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed"); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + 
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &device, args, NULL, NULL); + if (error_ != CL_SUCCESS) { + printf("\nerror: %d\n", error_); + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + if (testIdx_ == 0) { + initPipe_ = _wrapper->clCreateKernel(program_, "initPipe", &error_); + CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed"); + copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe", &error_); + CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed"); + readPipe_ = _wrapper->clCreateKernel(program_, "readPipe", &error_); + CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed"); + testName_ = "r/w"; + } else if (testIdx_ == 1) { + initPipe_ = _wrapper->clCreateKernel(program_, "initPipe_reserve", &error_); + CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed"); + copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe_reserve", &error_); + CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed"); + readPipe_ = _wrapper->clCreateKernel(program_, "readPipe_reserve", &error_); + CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed"); + numIter = 10; // Limit iteration count because this test is very slow + testName_ = "r/w w/ reserve"; + } else if (testIdx_ == 2) { + initPipe_ = _wrapper->clCreateKernel(program_, "initPipe_wg", &error_); + CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed"); + copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe_wg", &error_); + CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed"); + readPipe_ = _wrapper->clCreateKernel(program_, "readPipe_wg", &error_); + CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed"); + testName_ = "wg r/w w/ reserve"; + } else if (testIdx_ 
== 3) { + initPipe_ = _wrapper->clCreateKernel(program_, "initPipe_sg", &error_); + CHECK_RESULT(initPipe_ == 0, "clCreateKernel(initPipe) failed"); + copyPipe_ = _wrapper->clCreateKernel(program_, "copyPipe_sg", &error_); + CHECK_RESULT(copyPipe_ == 0, "clCreateKernel(copyPipe) failed"); + readPipe_ = _wrapper->clCreateKernel(program_, "readPipe_sg", &error_); + CHECK_RESULT(readPipe_ == 0, "clCreateKernel(readPipe) failed"); + testName_ = "sg r/w w/ reserve"; + } else { + CHECK_RESULT(1, "Invalid test index!"); + } + setData(srcBuffer_); +} + +void OCLPerfPipeCopySpeed::run(void) { + if (failed_) return; + CPerfCounter timer; + size_t global_work_size[1] = {(size_t)numElements}; + size_t local_work_size[1] = {64}; + + error_ = _wrapper->clSetKernelArg(initPipe_, 0, sizeof(cl_mem), + (void *)&srcBuffer_); + error_ = + _wrapper->clSetKernelArg(initPipe_, 1, sizeof(cl_mem), (void *)&pipe_[0]); + // Warm up + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, initPipe_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + error_ = + _wrapper->clSetKernelArg(copyPipe_, 0, sizeof(cl_mem), (void *)&pipe_[0]); + error_ = + _wrapper->clSetKernelArg(copyPipe_, 1, sizeof(cl_mem), (void *)&pipe_[1]); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, copyPipe_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < numIter; i++) { + error_ = _wrapper->clSetKernelArg(copyPipe_, 0, sizeof(cl_mem), + (void *)&pipe_[(i + 1) % 2]); + error_ = _wrapper->clSetKernelArg(copyPipe_, 1, sizeof(cl_mem), + (void *)&pipe_[i % 2]); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, copyPipe_, 1, NULL, (const size_t 
*)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + error_ = _wrapper->clFinish(cmd_queue_); + CHECK_RESULT(error_, "clFinish failed"); + + timer.Stop(); + + // pipe[(numIter-1)%2 has the data + error_ = _wrapper->clSetKernelArg(readPipe_, 0, sizeof(cl_mem), + (void *)&pipe_[(numIter - 1) % 2]); + error_ = _wrapper->clSetKernelArg(readPipe_, 1, sizeof(cl_mem), + (void *)&dstBuffer_); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, readPipe_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel(readPipe) failed"); + error_ = _wrapper->clFinish(cmd_queue_); + checkData(dstBuffer_); + double sec = timer.GetElapsedTime(); + + // Pipe copy total bandwidth in GB/s + double perf = 2. * ((double)bufSize_ * numIter * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), " %17s (%8d bytes) block size: %2d i:%4d (GB/s) ", + testName_.c_str(), bufSize_, typeSize[typeIdx_], numIter); + testDescString = buf; +} + +unsigned int OCLPerfPipeCopySpeed::close(void) { + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + if (pipe_[0]) { + error_ = _wrapper->clReleaseMemObject(pipe_[0]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(pipe_[0]) failed"); + } + if (pipe_[1]) { + error_ = _wrapper->clReleaseMemObject(pipe_[1]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(pipe_[1]) failed"); + } + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(dstBuffer_) failed"); + } + + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h new file mode 100644 index 0000000000..e517399d36 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfPipeCopySpeed.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PipeCopySpeed_H_ +#define _OCL_PipeCopySpeed_H_ + +#include "OCLTestImp.h" + +class OCLPerfPipeCopySpeed : public OCLTestImp { + public: + OCLPerfPipeCopySpeed(); + virtual ~OCLPerfPipeCopySpeed(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + static const unsigned int NUM_ITER = 100; + void setData(cl_mem buffer); + void checkData(cl_mem buffer); + + cl_command_queue cmd_queue_; + cl_mem srcBuffer_; + cl_mem pipe_[2]; + cl_mem dstBuffer_; + cl_program program_; + cl_kernel initPipe_; + cl_kernel copyPipe_; + cl_kernel readPipe_; + + unsigned int bufSize_; + unsigned int typeIdx_; + unsigned int numElements; + unsigned int numIter; + unsigned int testIdx_; + std::string testName_; + bool subgroupSupport_; + bool failed_; +}; + +#endif // _OCL_PipeCopySpeed_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp new file mode 100644 index 0000000000..950958740b --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.cpp @@ -0,0 +1,549 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfProgramGlobalRead.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const unsigned int NUM_SIZES = 4; +static const unsigned int NUM_READ_MODES = 6; +// Limit to 32 reads for now +static const unsigned int MAX_READ_MODES = 4; + +static const unsigned int NumReads[NUM_READ_MODES] = {1, 4, 16, 32, 64, 128}; +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; +static const unsigned int MaxTypes = 6; +static unsigned int NumTypes = MaxTypes; +static const char *types[MaxTypes] = {"char", "short", "int", + "long", "float", "double"}; +static unsigned int StartType = 0; +static const unsigned int NumVecWidths = + 3; // 5; char8 global scope does not work; bug opened +static const char *vecWidths[NumVecWidths] = {"", "2", "4"}; //, "8", "16"}; +static const unsigned int vecWidths_int[NumVecWidths] = {1, 2, 4}; //, 8, 16}; +static const unsigned int TypeSize[MaxTypes] = { + sizeof(cl_char), sizeof(cl_short), sizeof(cl_int), + sizeof(cl_long), sizeof(cl_float), sizeof(cl_double)}; +#define CHAR_BUF_SIZE 512 + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfProgramGlobalRead::genShader(unsigned int type, + unsigned int vecWidth, + unsigned int numReads, + unsigned int bufSize) { + char buf[CHAR_BUF_SIZE]; + + shader_.clear(); + shader_ += + "#ifdef USE_ARENA\n" + "#pragma OPENCL EXTENSION 
cl_khr_byte_addressable_store : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_AMD_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_KHR_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, "__global %s%s gp[%d];\n", types[type], + vecWidths[vecWidth], bufSize); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, + "__kernel void __attribute__((reqd_work_group_size(64,1,1))) " + "_ReadSpeed(__global %s%s * restrict outBuf, constant uint * " + "restrict constBuf)\n", + types[type], vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + "{\n" + " uint i = (uint) get_global_id(0);\n"; + if (numReads == 1) { + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + " const unsigned int Max = constBuf[0];\n" + " temp = *(gp + i % Max);\n"; + shader_ += + " *(outBuf + i) = temp;\n" + "}\n"; + } else { + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp0 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp1 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp2 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp3 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + " const unsigned int Max = constBuf[0];\n" + " unsigned int idx0 = (i % Max) + constBuf[1];\n" + " unsigned int idx1 = (i % Max) + constBuf[2];\n" + " unsigned int idx2 = (i % Max) + constBuf[3];\n" + " unsigned int idx3 = (i % Max) + constBuf[4];\n"; + + for (unsigned int i = 0; i < (numReads >> 2); i++) { + shader_ += " temp0 += *(gp + idx0);\n"; + shader_ += " temp1 += *(gp + idx1);\n"; + shader_ += " temp2 += *(gp + idx2);\n"; + shader_ += " temp3 += *(gp + idx3);\n"; + shader_ += " idx0 += constBuf[5];\n"; + shader_ += " idx1 
+= constBuf[5];\n"; + shader_ += " idx2 += constBuf[5];\n"; + shader_ += " idx3 += constBuf[5];\n"; + } + shader_ += + " *(outBuf + i) = temp0 + temp1 + temp2 + temp3;\n" + "}\n"; + } +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfProgramGlobalRead::OCLPerfProgramGlobalRead() { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + context_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of + // just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + char *p = strstr(charbuf, "cl_khr_byte_addressable_store"); + char *p2 = strstr(charbuf, "cl_khr_fp64"); + + NumTypes = MaxTypes; + if (!p) { + // No arena ops + NumTypes -= 2; + StartType = 2; + } + if (!p2) { + // Doubles not supported + NumTypes--; + } + _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES; + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + skip_ = false; +} + +OCLPerfProgramGlobalRead::~OCLPerfProgramGlobalRead() {} + +// Fill with 1s of appropriate type +void OCLPerfProgramGlobalRead::setData(cl_mem buffer, float val) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_WRITE, 0, + bufSize_, 0, NULL, NULL, &error_); + switch (typeIdx_) { + case 0: // char + { + char *data = (char *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(char)); i++) + data[i] = (char)val; + break; + } + case 1: // short + { + short *data = (short *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(short)); i++) + data[i] = (short)val; + break; + } + case 2: // int + { + int *data = 
(int *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(int)); i++) + data[i] = (int)val; + break; + } + case 3: // long + { + cl_long *data = (cl_long *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_long)); i++) + data[i] = (cl_long)val; + break; + } + case 4: // float + { + float *data = (float *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(float)); i++) + data[i] = val; + break; + } + case 5: // double + { + double *data = (double *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(double)); i++) + data[i] = (double)val; + break; + } + default: + // oops + break; + } + error_ = + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); +} + +void OCLPerfProgramGlobalRead::checkData(cl_mem buffer) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0, + bufSize_, 0, NULL, NULL, &error_); + switch (typeIdx_) { + case 0: // char + { + char *data = (char *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(char)); i++) { + if (data[i] != (char)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 1: // short + { + short *data = (short *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(short)); i++) { + if (data[i] != (short)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 2: // int + { + int *data = (int *)ptr; + for (unsigned int i = 0; i < 
(bufSize_ / sizeof(int)); i++) { + if (data[i] != (int)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 3: // long + { + cl_long *data = (cl_long *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_long)); i++) { + if (data[i] != (cl_long)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 4: // float + { + float *data = (float *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(float)); i++) { + if (data[i] != (float)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 5: // double + { + double *data = (double *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(double)); i++) { + if (data[i] != (double)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + default: + // oops + break; + } + error_ = + 
_wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); +} + +void OCLPerfProgramGlobalRead::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + constBuffer_ = 0; + +#if defined(CL_VERSION_2_0) + cl_device_id device; + numReads_ = NumReads[test % MAX_READ_MODES]; + width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES]; + vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths; + typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes + + StartType; + + bufSize_ = width_; + + cmd_queue_ = cmdQueues_[_deviceId]; + + device = devices_[_deviceId]; + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_); + CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed"); + + genShader(typeIdx_, vecSizeIdx_, numReads_, + bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_))); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + if (typeIdx_ < 2) { + args += "-D USE_ARENA "; + } + args += "-cl-std=CL2.0"; + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, 
"_ReadSpeed", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), + (void *)&constBuffer_); + + setData(outBuffer_, 1.2345678f); + unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL, + &error_); + // Force all wavefronts to fetch the same data. We are looking for peak speed + // here. + cBuf[0] = 64; + // These values are chosen to assure there is no data reuse within a clause. + // If caching is not working, then the uncached numbers will be low. + cBuf[1] = 0; + cBuf[2] = 64; + cBuf[3] = 128; + cBuf[4] = 192; + cBuf[5] = 0; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0, + NULL, NULL); + _wrapper->clFinish(cmd_queue_); +#else + skip_ = true; + testDescString = + "Program scope globals not supported for < 2.0 builds. 
Test Skipped."; + return; +#endif +} + +void OCLPerfProgramGlobalRead::run(void) { + if (skip_) { + return; + } +#if defined(CL_VERSION_2_0) + int global = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Program scope global read bandwidth in GB/s + double perf = + ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + char buf2[256]; + SNPRINTF(buf, sizeof(buf), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]); + SNPRINTF(buf2, sizeof(buf2), " %-8s (%8d) %2d reads: (GB/s) ", buf, width_, + numReads_); + testDescString = buf2; + // checkData(outBuffer_); +#endif +} + +unsigned int OCLPerfProgramGlobalRead::close(void) { +#if defined(CL_VERSION_2_0) + if (cmd_queue_) _wrapper->clFinish(cmd_queue_); + + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (constBuffer_) { + error_ = _wrapper->clReleaseMemObject(constBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(constBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } +#endif + return OCLTestImp::close(); +} diff 
--git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h new file mode 100644 index 0000000000..bef3e25985 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalRead.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PROGRAMGLOBALREAD_H +#define _OCL_PROGRAMGLOBALREAD_H + +#include "OCLTestImp.h" + +class OCLPerfProgramGlobalRead : public OCLTestImp { + public: + OCLPerfProgramGlobalRead(); + virtual ~OCLPerfProgramGlobalRead(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void genShader(unsigned int type, unsigned int vecWidth, + unsigned int numReads, unsigned int bufSize); + void setData(cl_mem buffer, float data); + void checkData(cl_mem buffer); + + static const unsigned int NUM_ITER = 100; + + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_mem outBuffer_; + cl_mem constBuffer_; + + unsigned int width_; + unsigned int bufSize_; + unsigned int vecSizeIdx_; + unsigned int numReads_; + unsigned int typeIdx_; + + bool skip_; +}; + +#endif // _OCL_PROGRAMGLOBALREAD_H diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp new file mode 100644 index 0000000000..a26d4caa24 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.cpp @@ -0,0 +1,384 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfProgramGlobalWrite.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const unsigned int NUM_SIZES = 4; +static const unsigned int NUM_READ_MODES = 6; +// Limit to 32 reads for now +static const unsigned int MAX_READ_MODES = 4; + +static const unsigned int NumReads[NUM_READ_MODES] = {1, 4, 16, 32, 64, 128}; +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; +static const unsigned int MaxTypes = 6; +static unsigned int NumTypes = MaxTypes; +static const char *types[MaxTypes] = {"char", "short", "int", + "long", "float", "double"}; +static unsigned int StartType = 0; +static const unsigned int NumVecWidths = + 3; // 5; char8 global scope does not work; bug opened +static const char *vecWidths[NumVecWidths] = {"", "2", "4"}; //, "8", "16"}; +static const unsigned int vecWidths_int[NumVecWidths] = {1, 2, 4}; //, 8, 16}; +static const unsigned int TypeSize[MaxTypes] = { + sizeof(cl_char), sizeof(cl_short), sizeof(cl_int), + sizeof(cl_long), sizeof(cl_float), sizeof(cl_double)}; +#define CHAR_BUF_SIZE 512 + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfProgramGlobalWrite::genShader(unsigned int type, + unsigned int vecWidth, + unsigned int numReads, + unsigned int bufSize) { + char buf[CHAR_BUF_SIZE]; + + shader_.clear(); + shader_ += + "#ifdef USE_ARENA\n" + "#pragma OPENCL EXTENSION 
cl_khr_byte_addressable_store : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_AMD_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_KHR_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, "__global %s%s gp[%d];\n", types[type], + vecWidths[vecWidth], bufSize); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, + "__kernel void __attribute__((reqd_work_group_size(64,1,1))) " + "_WriteSpeed(constant uint * restrict constBuf)\n"); + shader_.append(buf); + shader_ += + "{\n" + " uint i = (uint) get_global_id(0);\n"; + if (numReads == 1) { + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += " const unsigned int Max = constBuf[0];\n"; + shader_ += + " *(gp + i % Max) = 0;\n" + "}\n"; + } else { + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp0 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp1 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp2 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp3 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + " const unsigned int Max = constBuf[0];\n" + " unsigned int idx0 = (i % Max) + constBuf[1];\n" + " unsigned int idx1 = (i % Max) + constBuf[2];\n" + " unsigned int idx2 = (i % Max) + constBuf[3];\n" + " unsigned int idx3 = (i % Max) + constBuf[4];\n"; + + for (unsigned int i = 0; i < (numReads >> 2); i++) { + shader_ += " *(gp + idx0) = idx0;\n"; + shader_ += " *(gp + idx1) = idx1;\n"; + shader_ += " *(gp + idx2) = idx2;\n"; + shader_ += " *(gp + idx3) = idx3;\n"; + shader_ += " idx0 += constBuf[5];\n"; + shader_ += " idx1 += constBuf[5];\n"; + shader_ += " idx2 += constBuf[5];\n"; + shader_ += " idx3 += constBuf[5];\n"; + } + shader_ 
+= "}\n"; + } + SNPRINTF(buf, CHAR_BUF_SIZE, "__kernel void __dummyRead(global %s%s *in)\n", + types[type], vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + "{\n" + " uint i = (uint) get_global_id(0);\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, " in[i] = gp[i];\n"); + shader_.append(buf); + shader_ += "}\n"; +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfProgramGlobalWrite::OCLPerfProgramGlobalWrite() { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + context_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of + // just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + char *p = strstr(charbuf, "cl_khr_byte_addressable_store"); + char *p2 = strstr(charbuf, "cl_khr_fp64"); + + NumTypes = MaxTypes; + if (!p) { + // No arena ops + NumTypes -= 2; + StartType = 2; + } + if (!p2) { + // Doubles not supported + NumTypes--; + } + _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES; + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + skip_ = false; +} + +OCLPerfProgramGlobalWrite::~OCLPerfProgramGlobalWrite() {} + +void OCLPerfProgramGlobalWrite::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + error_ = CL_SUCCESS; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + outBuffer_ = 0; + constBuffer_ = 0; + +#if defined(CL_VERSION_2_0) + cl_device_id device; + numReads_ = NumReads[test % MAX_READ_MODES]; + width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES]; + vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths; + typeIdx_ = 
(test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes + + StartType; + + bufSize_ = width_; + + cmd_queue_ = cmdQueues_[_deviceId]; + + device = devices_[_deviceId]; + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_); + CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed"); + + genShader(typeIdx_, vecSizeIdx_, numReads_, + bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_))); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + if (typeIdx_ < 2) { + args += "-D USE_ARENA "; + } + args += "-cl-std=CL2.0"; + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "_WriteSpeed", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&constBuffer_); + + unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL, + &error_); + // Force all wavefronts to fetch the same data. We are looking for peak speed + // here. + cBuf[0] = 64; + // These values are chosen to assure there is no data reuse within a clause. + // If caching is not working, then the uncached numbers will be low. 
+ cBuf[1] = 0; + cBuf[2] = 64; + cBuf[3] = 128; + cBuf[4] = 192; + cBuf[5] = 0; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0, + NULL, NULL); + _wrapper->clFinish(cmd_queue_); +#else + skip_ = true; + testDescString = + "Program scope globals not supported for < 2.0 builds. Test Skipped."; + return; +#endif +} + +void OCLPerfProgramGlobalWrite::run(void) { + if (skip_) { + return; + } +#if defined(CL_VERSION_2_0) + int global = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Program scope global write bandwidth in GB/s + double perf = + ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + char buf2[256]; + SNPRINTF(buf, sizeof(buf), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]); + SNPRINTF(buf2, sizeof(buf2), " %-8s (%8d) %2d reads: (GB/s) ", buf, width_, + numReads_); + testDescString = buf2; +#endif +} + +unsigned int OCLPerfProgramGlobalWrite::close(void) { +#if defined(CL_VERSION_2_0) + if (cmd_queue_) _wrapper->clFinish(cmd_queue_); + + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (constBuffer_) { + error_ = _wrapper->clReleaseMemObject(constBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(constBuffer_) failed"); + } + if (kernel_) { + error_ = 
_wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } +#endif + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h new file mode 100644 index 0000000000..6102bb7428 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfProgramGlobalWrite.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PROGRAMGLOBALWRITE_H_ +#define _OCL_PROGRAMGLOBALWRITE_H_ + +#include "OCLTestImp.h" + +class OCLPerfProgramGlobalWrite : public OCLTestImp { + public: + OCLPerfProgramGlobalWrite(); + virtual ~OCLPerfProgramGlobalWrite(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void genShader(unsigned int type, unsigned int vecWidth, + unsigned int numReads, unsigned int bufSize); + + static const unsigned int NUM_ITER = 100; + + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_mem outBuffer_; + cl_mem constBuffer_; + + unsigned int width_; + unsigned int bufSize_; + unsigned int vecSizeIdx_; + unsigned int numReads_; + unsigned int typeIdx_; + + bool skip_; +}; + +#endif // _OCL_PROGRAMGLOBALWRITE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp new file mode 100644 index 0000000000..9cea4518d8 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.cpp @@ -0,0 +1,841 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfSHA256.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const char *sha256_kernel = + "typedef uint UINT;\n" + "\n" + "#define VECTOR_LEN 1\n" + "\n" + "#ifdef LITTLE_E\n" + "\n" + "inline UINT byteswap(UINT x)\n" + "{\n" + " UINT res = 0;\n" + " \n" + " for (uint i=0; i<4; i++)\n" + " {\n" + " res <<= 8;\n" + " res |= (x & 0xff);\n" + " x >>= 8;\n" + " }\n" + " \n" + " return res;\n" + "}\n" + "\n" + "#else\n" + "\n" + "inline UINT byteswap(const UINT x)\n" + "{\n" + " return x;\n" + "}\n" + "\n" + "#endif\n" + "\n" + "\n" + "void sha256_step( const UINT data[16], UINT *state )\n" + "{\n" + " UINT W[64], temp1, temp2;\n" + " UINT A, B, C, D, E, F, G, H;\n" + "\n" + " for( int i = 0; i < 16; i++)\n" + " {\n" + " W[i] = byteswap(data[i]);\n" + " }\n" + "\n" + "#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)\n" + "#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))\n" + "\n" + "#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))\n" + "#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))\n" + "\n" + "#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))\n" + "#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))\n" + "\n" + "#define F0(x,y,z) ((x & y) | (z & (x | y)))\n" + "#define F1(x,y,z) (z ^ (x & (y ^ z)))\n" + "\n" + "#define R(t) \\\n" + "( \\\n" + " W[t] = S1(W[t - 2]) + W[t - 7] + \\\n" + " S0(W[t - 15]) + W[t - 16] \\\n" + ")\n" + "\n" + "#define P(a,b,c,d,e,f,g,h,x,K) \\\n" + "{ \\\n" + " temp1 = h + S3(e) + F1(e,f,g) + K + x; \\\n" + " temp2 = 
S2(a) + F0(a,b,c); \\\n" + " d += temp1; h = temp1 + temp2; \\\n" + "}\n" + "\n" + " A = state[0];\n" + " B = state[1];\n" + " C = state[2];\n" + " D = state[3];\n" + " E = state[4];\n" + " F = state[5];\n" + " G = state[6];\n" + " H = state[7];\n" + "\n" + " P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 );\n" + " P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 );\n" + " P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF );\n" + " P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 );\n" + " P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B );\n" + " P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 );\n" + " P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 );\n" + " P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 );\n" + " P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 );\n" + " P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 );\n" + " P( G, H, A, B, C, D, E, F, W[10], 0x243185BE );\n" + " P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 );\n" + " P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 );\n" + " P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE );\n" + " P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 );\n" + " P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 );\n" + " P( A, B, C, D, E, F, G, H, R(16), 0xE49B69C1 );\n" + " P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 );\n" + " P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 );\n" + " P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC );\n" + " P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F );\n" + " P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA );\n" + " P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC );\n" + " P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA );\n" + " P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 );\n" + " P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D );\n" + " P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 );\n" + " P( F, G, H, A, B, C, D, E, R(27), 0xBF597FC7 );\n" + " P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 );\n" + " P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 );\n" + " P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 );\n" + " P( B, C, D, E, F, G, H, A, R(31), 
0x14292967 );\n" + " P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 );\n" + " P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 );\n" + " P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC );\n" + " P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 );\n" + " P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 );\n" + " P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB );\n" + " P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E );\n" + " P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 );\n" + " P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 );\n" + " P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B );\n" + " P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 );\n" + " P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 );\n" + " P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 );\n" + " P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 );\n" + " P( C, D, E, F, G, H, A, B, R(46), 0xF40E3585 );\n" + " P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 );\n" + " P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 );\n" + " P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 );\n" + " P( G, H, A, B, C, D, E, F, R(50), 0x2748774C );\n" + " P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 );\n" + " P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 );\n" + " P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A );\n" + " P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F );\n" + " P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 );\n" + " P( A, B, C, D, E, F, G, H, R(56), 0x748F82EE );\n" + " P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F );\n" + " P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 );\n" + " P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 );\n" + " P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA );\n" + " P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB );\n" + " P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 );\n" + " P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 );\n" + "\n" + " state[0] += A;\n" + " state[1] += B;\n" + " state[2] += C;\n" + " state[3] += D;\n" + " state[4] += E;\n" + " state[5] += F;\n" + " state[6] += G;\n" + " state[7] += H;\n" + "}\n" + "\n" + "\n" + "#define 
choose_temp(x) ((x)/16)\n" + "\n" + "#define STORE_TO_TEMP(i) tb[((i)/16)][((i)%16)]\n" + "\n" + "\n" + "__kernel void CryptThread(__global const uint *buffer, __global uint " + "*state, const uint blockLen, const uint foo)\n" + "{\n" + " const uint init[8] = {\n" + " 0x6a09e667,\n" + " 0xbb67ae85,\n" + " 0x3c6ef372,\n" + " 0xa54ff53a,\n" + " 0x510e527f,\n" + " 0x9b05688c,\n" + " 0x1f83d9ab,\n" + " 0x5be0cd19\n" + " };\n" + " \n" + " const uint id = get_global_id(0);\n" + " uint len = blockLen;\n" + " uint i, j;\n" + " const uint startPosInDWORDs = (len*id*foo)/4;\n" + " const uint msgLenInBitsl = len * 8;\n" + " const uint msgLenInBitsh = (len) >> (32-3);\n" + " UINT localState[8];\n" + "\n" + " for (j=0; j<8; j++) {\n" + " localState[j] = init[j];\n" + " }\n" + "\n" + " i = 0;\n" + " while (len >=64)\n" + " {\n" + " UINT data[16];\n" + " for (j=0; j<16; j++) {\n" + " data[j] = buffer[j + startPosInDWORDs + i];\n" + " }\n" + "\n" + " sha256_step(data, localState);\n" + " i += 16;\n" + " len -= 64;\n" + " }\n" + "\n" + " len /= 4;\n" + "\n" + " UINT tb[2][16];\n" + "\n" + " for (j=0; j>= 8;\n" + " }\n" + " \n" + " return res;\n" + "}\n" + "\n" + "#else\n" + "\n" + "inline UINT byteswap(const UINT x)\n" + "{\n" + " return x;\n" + "}\n" + "\n" + "#endif\n" + "\n" + "\n" + "void sha256_step( const UINT data[16], UINT *state )\n" + "{\n" + " UINT W[64], temp1, temp2;\n" + " UINT A, B, C, D, E, F, G, H;\n" + "\n" + " for( int i = 0; i < 16; i++)\n" + " {\n" + " W[i] = byteswap(data[i]);\n" + " }\n" + "\n" + "#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)\n" + "#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))\n" + "\n" + "#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))\n" + "#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))\n" + "\n" + "#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))\n" + "#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))\n" + "\n" + "#define F0(x,y,z) ((x & y) | (z & (x | y)))\n" + "#define F1(x,y,z) (z ^ (x & (y ^ z)))\n" + "\n" + "#define R(t) 
\\\n" + "( \\\n" + " W[t] = S1(W[t - 2]) + W[t - 7] + \\\n" + " S0(W[t - 15]) + W[t - 16] \\\n" + ")\n" + "\n" + "#define P(a,b,c,d,e,f,g,h,x,K) \\\n" + "{ \\\n" + " temp1 = h + S3(e) + F1(e,f,g) + K + x; \\\n" + " temp2 = S2(a) + F0(a,b,c); \\\n" + " d += temp1; h = temp1 + temp2; \\\n" + "}\n" + "\n" + " A = state[0];\n" + " B = state[1];\n" + " C = state[2];\n" + " D = state[3];\n" + " E = state[4];\n" + " F = state[5];\n" + " G = state[6];\n" + " H = state[7];\n" + "\n" + " P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 );\n" + " P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 );\n" + " P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF );\n" + " P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 );\n" + " P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B );\n" + " P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 );\n" + " P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 );\n" + " P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 );\n" + " P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 );\n" + " P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 );\n" + " P( G, H, A, B, C, D, E, F, W[10], 0x243185BE );\n" + " P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 );\n" + " P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 );\n" + " P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE );\n" + " P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 );\n" + " P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 );\n" + " P( A, B, C, D, E, F, G, H, R(16), 0xE49B69C1 );\n" + " P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 );\n" + " P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 );\n" + " P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC );\n" + " P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F );\n" + " P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA );\n" + " P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC );\n" + " P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA );\n" + " P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 );\n" + " P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D );\n" + " P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 );\n" + " P( F, G, H, A, B, C, D, E, 
R(27), 0xBF597FC7 );\n" + " P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 );\n" + " P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 );\n" + " P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 );\n" + " P( B, C, D, E, F, G, H, A, R(31), 0x14292967 );\n" + " P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 );\n" + " P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 );\n" + " P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC );\n" + " P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 );\n" + " P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 );\n" + " P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB );\n" + " P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E );\n" + " P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 );\n" + " P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 );\n" + " P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B );\n" + " P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 );\n" + " P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 );\n" + " P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 );\n" + " P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 );\n" + " P( C, D, E, F, G, H, A, B, R(46), 0xF40E3585 );\n" + " P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 );\n" + " P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 );\n" + " P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 );\n" + " P( G, H, A, B, C, D, E, F, R(50), 0x2748774C );\n" + " P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 );\n" + " P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 );\n" + " P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A );\n" + " P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F );\n" + " P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 );\n" + " P( A, B, C, D, E, F, G, H, R(56), 0x748F82EE );\n" + " P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F );\n" + " P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 );\n" + " P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 );\n" + " P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA );\n" + " P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB );\n" + " P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 );\n" + " P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 
);\n" + "\n" + " state[0] += A;\n" + " state[1] += B;\n" + " state[2] += C;\n" + " state[3] += D;\n" + " state[4] += E;\n" + " state[5] += F;\n" + " state[6] += G;\n" + " state[7] += H;\n" + "}\n" + "\n" + "\n" + "#define choose_temp(x) ((x)/16)\n" + "\n" + "#define STORE_TO_TEMP(i) tb[((i)/16)][((i)%16)]\n" + "\n" + "#define WAVEFRONT_SIZE 64\n" + "\n" + "__kernel void CryptThread(__global const uint *buffer, __global uint " + "*state, const uint blockLen, const uint foo)\n" + "{\n" + " const uint init[8] = {\n" + " 0x6a09e667,\n" + " 0xbb67ae85,\n" + " 0x3c6ef372,\n" + " 0xa54ff53a,\n" + " 0x510e527f,\n" + " 0x9b05688c,\n" + " 0x1f83d9ab,\n" + " 0x5be0cd19\n" + " };\n" + " \n" + " const uint id = get_global_id(0);\n" + " const uint lid = get_local_id(0);\n" + " uint len = blockLen;\n" + " uint i, j;\n" + " const uint startPosInDWORDs = (len*id*foo)/4;\n" + "uint blockStartInDWORDs = (len*(id / WAVEFRONT_SIZE)*WAVEFRONT_SIZE)/4;\n" + " const uint msgLenInBitsl = len * 8;\n" + " const uint msgLenInBitsh = (len) >> (32-3);\n" + " UINT localState[8];\n" + "\n" + " for (j=0; j<8; j++) {\n" + " localState[j] = init[j];\n" + " }\n" + "\n" + " i = 0;\n" + " while (len >=64)\n" + " {\n" + " UINT data[16];\n" + " for (j=0; j<16; j++) {\n" + " //data[j] = buffer[j + startPosInDWORDs + i];\n" + " data[j] = buffer[j*WAVEFRONT_SIZE + blockStartInDWORDs " + "+ i*WAVEFRONT_SIZE + lid];\n" + " }\n" + "\n" + " sha256_step(data, localState);\n" + " i += 16;\n" + " len -= 64;\n" + " }\n" + "\n" + " len /= 4;\n" + "\n" + " UINT tb[2][16];\n" + "\n" + " for (j=0; jclEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + + if (error_ != CL_SUCCESS) { + printf("\nError code : %d\n", error_); + } else { + for (unsigned int i = 0; i < width_; i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, + NULL, NULL); + if (error_ == CL_SUCCESS) retVal = true; + } + return retVal; +} + +void 
OCLPerfSHA256::checkData(cl_mem buffer) {
+  // Result verification is not implemented: the buffer is only mapped for
+  // reading and unmapped again, nothing is compared.
+  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
+      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
+      &error_);
+  if (error_ != CL_SUCCESS || data == NULL) {
+    // The map failed, so there is no mapping to release.
+    return;
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Error callback handed to clCreateContext; intentionally does nothing.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares one sub-test.
+//
+// test       - sub-test index. (test % NUM_BUF_TYPES) selects how many input
+//              and output buffers are rotated through (1/1, 1/4 or 4/4);
+//              test >= NUM_BUF_TYPES selects sha256_opt_kernel instead of
+//              sha256_kernel.
+// units      - not used by this test.
+// conversion - set to 1.0.
+// deviceId   - index of the device to use on platform _platformIndex.
+//
+// Creates the context, command queue, buffers, program and kernel and binds
+// the kernel arguments. Failures are reported through CHECK_RESULT.
+void OCLPerfSHA256::open(unsigned int test, char *units, double &conversion,
+                         unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  inBuffer_ = 0;
+  outBuffer_ = 0;
+  num_input_buf_ = 1;
+  num_output_buf_ = 1;
+  blockSize_ = 1024;
+  isAMD = false;
+
+  width_ = 22347776;
+  // Every buffer holds width_ 32-bit words.
+  bufSize_ = width_ * sizeof(cl_uint);
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    const cl_int idsStatus =
+        _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    error_ = idsStatus;
+    if (idsStatus == CL_SUCCESS && (cl_uint)_platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+      char pbuf[100];
+      pbuf[0] = '\0';  // stays an empty string if the vendor query fails
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so this status is deliberately not checked.
+      error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL,
+                                        &num_devices);
+      if (num_devices > 0 && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
+        isAMD = true;
+      }
+    }
+    // The array came from new[], so it needs delete[]. It is released before
+    // the status check so that a failure does not leak it.
+    delete[] platforms;
+    CHECK_RESULT(idsStatus != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+  /*
+   * If we could find our platform, use it.  If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0,
+               "Couldn't find platform with GPU devices, cannot proceed");
+
+  CHECK_RESULT(num_devices == 0, "no devices");
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  const bool deviceAvailable = (_deviceId < num_devices);
+  if (error_ == CL_SUCCESS && deviceAvailable) {
+    device = devices[_deviceId];
+  }
+  // Only the selected handle is used from here on; release the temporary list
+  // before any check can leave the function.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(!deviceAvailable, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  char charbuf[1024];
+  size_t retsize;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
+                                     charbuf, &retsize);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  switch (_openTest % NUM_BUF_TYPES) {
+    case 0:
+      num_input_buf_ = 1;
+      num_output_buf_ = 1;
+      break;
+
+    case 1:
+      num_input_buf_ = 1;
+      num_output_buf_ = 4;
+      break;
+
+    case 2:
+      num_input_buf_ = 4;
+      num_output_buf_ = 4;
+      break;
+
+    default:
+      // Keep the 1-in / 1-out defaults assigned above.
+      break;
+  }
+
+  // Value-initialise the handle arrays so close() can tell created buffers
+  // from slots that were never reached if open() stops early.
+  inBuffer_ = new cl_mem[num_input_buf_]();
+  outBuffer_ = new cl_mem[num_output_buf_]();
+
+  for (int i = 0; i < num_input_buf_; ++i) {
+    inBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(inBuffer_[i] == 0, "clCreateBuffer(inBuffer) failed");
+    bool result = setData(inBuffer_[i], 0xdeadbeef);
+    CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed");
+  }
+
+  for (int i = 0; i < num_output_buf_; ++i) {
+    outBuffer_[i] =
+        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+    CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
+    bool result = setData(outBuffer_[i], 0xdeadbeef);
+    CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed");
+  }
+
+  // Sub-tests at or above NUM_BUF_TYPES use the "opt" kernel source.
+  const char *source =
+      (_openTest >= NUM_BUF_TYPES) ? sha256_opt_kernel : sha256_kernel;
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
+                                                 &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  const char *buildOps = NULL;
+  if (isAMD) {
+    // Enable caching
+    buildOps = "-fno-alias";
+  }
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    log[0] = '\0';  // print an empty log rather than garbage if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    CHECK_RESULT(0, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                    (void *)&inBuffer_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(0) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem),
+                                    (void *)&outBuffer_[0]);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(1) failed");
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
+                                    (void *)&blockSize_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(2) failed");
+  // Foo is not part of the original test, this can be used to see how much of
+  // the performance is limited by fetch.  Set foo to 0 and all threads will
+  // fetch the same 1k block.  This way they will all be in cache and hit max
+  // fetch speed.
+  unsigned int foo = 1;
+  error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), (void *)&foo);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(3) failed");
+}
+
+// Launches the kernel 10 times as warm-up, then times MAX_ITERATIONS launches
+// and stores the throughput (bufSize_ bytes per launch, in GB/s) in _perfInfo.
+// With more than one input/output buffer the launches rotate through them.
+void OCLPerfSHA256::run(void) {
+  // One work-item per blockSize_-byte block of the buffer.
+  int global = bufSize_ / blockSize_;
+  // 32 gives the best result due to memory thrashing.  Need to optimize and
+  // give feedback to SiSoft.
+  // NOTE(review): the value actually used below is 64, not 32.
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  // Warm-up
+  for (unsigned int i = 0; i < 10; i++) {
+    if (num_input_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                        (void *)&inBuffer_[i % num_input_buf_]);
+    }
+
+    if (num_output_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(
+          kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_[i % num_output_buf_]);
+    }
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+  }
+
+  // Only the status of the last enqueue is examined.
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < MAX_ITERATIONS; i++) {
+    if (num_input_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
+                                        (void *)&inBuffer_[i % num_input_buf_]);
+    }
+
+    if (num_output_buf_ > 1) {
+      error_ = _wrapper->clSetKernelArg(
+          kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_[i % num_output_buf_]);
+    }
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+  }
+
+  // Only the status of the last enqueue is examined.
+  CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // No idea what data should be in here
+  // checkData(outBuffer_);
+  // Compute GB/s
+  double perf =
+      ((double)bufSize_ * (double)MAX_ITERATIONS * (double)(1e-09)) / sec;
+
+  _perfInfo = (float)perf;
+  if (_openTest >= NUM_BUF_TYPES) {
+    testDescString = "opt ";
+  } else {
+    testDescString = "def ";
+  }
+
+  testDescString += "with ";
+  char str[40];
+  sprintf(str, "%2d ip buff and %2d op buff ", num_input_buf_, num_output_buf_);
+  testDescString += str;
+}
+
+// Releases everything open() created and returns _crcword. Safe to call after
+// a partially completed open(): handles that were never created are skipped,
+// and released handles are cleared.
+unsigned int OCLPerfSHA256::close(void) {
+  if (cmd_queue_) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  if (inBuffer_) {
+    for (int i = 0; i < num_input_buf_; ++i) {
+      if (inBuffer_[i] == 0) {
+        continue;  // slot was never filled
+      }
+      error_ = _wrapper->clReleaseMemObject(inBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(inBuffer_) failed");
+    }
+    delete[] inBuffer_;
+    inBuffer_ = 0;
+  }
+  if (outBuffer_) {
+    for (int i = 0; i < num_output_buf_; ++i) {
+      if (outBuffer_[i] == 0) {
+        continue;  // slot was never filled
+      }
+      error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
+      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                             "clReleaseMemObject(outBuffer_) failed");
+    }
+    delete[] outBuffer_;
+    outBuffer_ = 0;
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    kernel_ = 0;
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+    program_ = 0;
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+    cmd_queue_ = 0;
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+    context_ = 0;
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h
new file mode 100644
index 0000000000..60d62efbe5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSHA256.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_SHA256_H_
+#define _OCL_SHA256_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times repeated launches of a SHA-256 OpenCL kernel
+// ("CryptThread") and reports the result in GB/s.
+class OCLPerfSHA256 : public OCLTestImp {
+ public:
+  OCLPerfSHA256();
+  virtual ~OCLPerfSHA256();
+
+ public:
+  // Creates the context, queue, buffers, program and kernel for sub-test
+  // `test` on device `deviceID`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Runs a warm-up pass and then MAX_ITERATIONS timed kernel launches.
+  virtual void run(void);
+  // Releases the OpenCL objects created by open().
+  virtual unsigned int close(void);
+
+  // NOTE(review): not referenced by the open()/run()/close() code in view --
+  // confirm whether it is still needed.
+  std::string shader_;
+  // Maps `buffer` for writing and stores `data` into each of its width_
+  // elements; returns true when the unmap succeeded.
+  bool setData(cl_mem buffer, unsigned int data);
+  // Maps `buffer` for reading and unmaps it again; no values are compared.
+  void checkData(cl_mem buffer);
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  // Arrays of num_input_buf_ / num_output_buf_ buffers; run() rotates through
+  // them when there is more than one.
+  cl_mem* inBuffer_;
+  cl_mem* outBuffer_;
+  cl_int num_input_buf_;
+  cl_int num_output_buf_;
+  // Status of the most recent OpenCL call.
+  cl_int error_;
+
+  // Number of cl_uint elements per buffer.
+  unsigned int width_;
+  // Buffer size in bytes (width_ * sizeof(cl_uint)).
+  unsigned int bufSize_;
+  // Passed to the kernel as argument 2; run() launches bufSize_ / blockSize_
+  // work-items.
+  unsigned int blockSize_;
+  // Number of timed kernel launches in run().
+  static const unsigned int MAX_ITERATIONS = 100;
+  // Set in open() when the platform vendor string is
+  // "Advanced Micro Devices, Inc." and the platform reports devices; selects
+  // the "-fno-alias" build option.
+  bool isAMD;
+};
+
+#endif  // _OCL_SHA256_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp
new file mode 100644
index 0000000000..15746d163b
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.cpp
@@ -0,0 +1,263 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSVMAlloc.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sstream>
+#include <string>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 5
+#define NUM_CG_FLAGS 3
+#define NUM_FG_FLAGS 3
+
+// Element counts; run() multiplies them by sizeof(cl_int4) to get bytes.
+static size_t sizeList[NUM_SIZES] = {
+    0x040000, 0x080000, 0x100000, 0x200000, 0x400000,
+};
+
+#if defined(CL_VERSION_2_0)
+// Access flags, indexed by testCGFlag_.
+static const cl_svm_mem_flags CGFlags[NUM_CG_FLAGS] = {
+    CL_MEM_READ_WRITE,
+    CL_MEM_WRITE_ONLY,
+    CL_MEM_READ_ONLY,
+};
+// SVM granularity flags, indexed by testFGFlag_.
+static const cl_svm_mem_flags FGFlags[NUM_FG_FLAGS] = {
+    0,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER,
+    CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+};
+
+// Frees a test allocation with the allocator that produced it: free() for the
+// fine-grain-system (malloc) case, clSVMFree() otherwise.
+static void releaseTestBuffer(cl_context context, void *buffer,
+                              bool fromMalloc) {
+  if (fromMalloc) {
+    free(buffer);
+  } else {
+    clSVMFree(context, buffer);
+  }
+}
+#endif
+
+// The store is guarded by a condition that compares a local id against 0 with
+// "< 0", so the kernel only has to be launched with the buffer bound.
+static const char *strKernel =
+    "__kernel void dummy(__global uint* out) \n"
+    "{ \n"
+    " uint id = get_global_id(0); \n"
+    " uint value = 1; \n"
+    " if ((int)get_local_id(0) < 0) \n"
+    " out[id] = value; \n"
+    "} \n";
+
+// One sub-test per (fine-grain flag, coarse-grain flag, size) combination plus
+// NUM_SIZES sub-tests for fine-grain system (malloc) memory.
+OCLPerfSVMAlloc::OCLPerfSVMAlloc() {
+  _numSubTests = NUM_CG_FLAGS * NUM_FG_FLAGS * NUM_SIZES + NUM_SIZES;
+  failed_ = false;
+  skip_ = false;
+}
+
+OCLPerfSVMAlloc::~OCLPerfSVMAlloc() {}
+
+// Decodes `test` into size / flag indices, skips the sub-test when the device
+// lacks the required SVM capability, and builds the "dummy" kernel.
+void OCLPerfSVMAlloc::open(unsigned int test, char *units, double &conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+#if defined(CL_VERSION_2_0)
+  FGSystem_ = (test >= (NUM_CG_FLAGS * NUM_FG_FLAGS * NUM_SIZES));
+  testFGFlag_ = (test / (NUM_SIZES * NUM_CG_FLAGS)) % NUM_FG_FLAGS;
+  testCGFlag_ = (test / NUM_SIZES) % NUM_CG_FLAGS;
+  testSize_ = test % NUM_SIZES;
+
+  cl_device_svm_capabilities caps;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES,
+      sizeof(cl_device_svm_capabilities), &caps, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if ((caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) == 0) {
+    skip_ = true;  // Should never happen as OCL 2.0 devices are required to
+                   // support coarse grain SVM
+    testDescString = "Coarse Grain Buffer NOT supported. Test Skipped.";
+    return;
+  } else if (testFGFlag_ > 0 && (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0) {
+    skip_ = true;  // No support for fine grain buffer SVM
+    testDescString = "Fine Grain Buffer NOT supported. Test Skipped.";
+    return;
+  } else if (FGSystem_ && (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) {
+    skip_ = true;  // No support for fine grain system SVM
+    testDescString = "Fine Grain System NOT supported. Test Skipped.";
+    return;
+  } else if (testFGFlag_ == 2 && (caps & CL_DEVICE_SVM_ATOMICS) == 0) {
+    skip_ = true;  // No support for SVM atomics
+    testDescString = "SVM Atomic NOT supported. Test Skipped.";
+    return;
+  }
+
+  cl_device_type deviceType;
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
+                                     sizeof(deviceType), &deviceType, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
+
+  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
+    printf("GPU device is required for this test!\n");
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024];
+    programLog[0] = '\0';  // print nothing rather than garbage if the query fails
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+  return;
+#else
+  skip_ = true;
+  testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped.";
+  return;
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Measures allocation + free cost.
+//
+// Pass 1 times `iter` rounds of allocate / bind / launch / finish / free.
+// Pass 2 times `numN` bind + launch rounds on a single allocation.
+// The reported figure divides the buffer size by the per-round difference
+// between the two passes.
+void OCLPerfSVMAlloc::run(void) {
+  if (skip_) {
+    return;
+  }
+
+  if (failed_) {
+    return;
+  }
+#if defined(CL_VERSION_2_0)
+  cl_uint *buffer = NULL;
+  CPerfCounter timer;
+
+  size_t bufSize = sizeList[testSize_] * sizeof(cl_int4);
+  size_t iter = 100;
+
+  cl_mem_flags flags = CGFlags[testCGFlag_] | FGFlags[testFGFlag_];
+
+  timer.Reset();
+  timer.Start();
+
+  size_t gws[1] = {bufSize / sizeof(cl_int4)};
+  size_t lws[1] = {64};
+
+  for (size_t i = 0; i < iter; ++i) {
+    if (!FGSystem_) {
+      buffer = (cl_uint *)clSVMAlloc(context_, flags, bufSize, 0);
+    } else {
+      buffer = (cl_uint *)malloc(bufSize);
+    }
+    CHECK_RESULT(buffer == 0, "Allocation failed");
+
+    error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, 0, buffer);
+    if (error_ != CL_SUCCESS) {
+      // Release the allocation before the check below leaves the function.
+      releaseTestBuffer(context_, buffer, FGSystem_);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      releaseTestBuffer(context_, buffer, FGSystem_);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+    _wrapper->clFinish(cmdQueues_[_deviceId]);
+
+    releaseTestBuffer(context_, buffer, FGSystem_);
+  }
+
+  timer.Stop();
+
+  CPerfCounter timer2;
+  timer2.Reset();
+  size_t numN = 100;
+
+  if (!FGSystem_) {
+    buffer = (cl_uint *)clSVMAlloc(context_, flags, bufSize, 0);
+  } else {
+    buffer = (cl_uint *)malloc(bufSize);
+  }
+  CHECK_RESULT(buffer == 0, "Allocation failed");
+
+  timer2.Start();
+  for (size_t i = 0; i < numN; ++i) {
+    error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, 0, buffer);
+    if (error_ != CL_SUCCESS) {
+      // Earlier launches may still be using the buffer: drain the queue, then
+      // release it before the check below leaves the function.
+      _wrapper->clFinish(cmdQueues_[_deviceId]);
+      releaseTestBuffer(context_, buffer, FGSystem_);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, gws, lws, 0, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      _wrapper->clFinish(cmdQueues_[_deviceId]);
+      releaseTestBuffer(context_, buffer, FGSystem_);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  }
+  _wrapper->clFinish(cmdQueues_[_deviceId]);
+  timer2.Stop();
+
+  releaseTestBuffer(context_, buffer, FGSystem_);
+
+  char pFlags[5];
+  pFlags[0] =
+      (testCGFlag_ == 0 || testCGFlag_ == 2) ? 'R' : '_';  // CL_MEM_READ_ONLY
+  pFlags[1] =
+      (testCGFlag_ == 0 || testCGFlag_ == 1) ? 'W' : '_';  // CL_MEM_WRITE_ONLY
+  pFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2)
+                  ? 'F'
+                  : '_';  // CL_MEM_SVM_FINE_GRAIN_BUFFER
+  pFlags[3] = (testFGFlag_ == 2) ? 'A' : '_';  // CL_MEM_SVM_ATOMICS
+  // "%4s" is a minimum width, not a limit, so the string must be terminated.
+  pFlags[4] = '\0';
+
+  char buf[256];
+
+  if (!FGSystem_ && (testFGFlag_ == 0)) {
+    SNPRINTF(buf, sizeof(buf),
+             "Coarse Grain Buffer Alloc + Free (GB/s) for %6d KB, flags=%4s",
+             (int)bufSize / 1024, pFlags);
+  } else if (!FGSystem_ && (testFGFlag_ > 0)) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain Buffer Alloc + Free (GB/s) for %6d KB, flags=%4s",
+             (int)bufSize / 1024, pFlags);
+  } else if (FGSystem_) {
+    SNPRINTF(buf, sizeof(buf),
+             "Fine Grain System Alloc + Free (GB/s) for %6d KB, flags=N/A ",
+             (int)bufSize / 1024);
+  }
+
+  testDescString = buf;
+  double sec1 = timer.GetElapsedTime();
+  double sec2 = timer2.GetElapsedTime();
+  _perfInfo = static_cast<float>((bufSize * (double)(1e-09)) /
+                                 (sec1 / iter - sec2 / numN));
+#endif
+}
+
+unsigned int OCLPerfSVMAlloc::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h
new file mode 100644
index 0000000000..4a4818a3c6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMAlloc.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PERF_SVM_ALLOC_H_ +#define _OCL_PERF_SVM_ALLOC_H_ + +#include "OCLTestImp.h" + +class OCLPerfSVMAlloc : public OCLTestImp { + public: + OCLPerfSVMAlloc(); + virtual ~OCLPerfSVMAlloc(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int testSize_; + bool FGSystem_; + unsigned int testCGFlag_; + unsigned int testFGFlag_; + bool skip_; +}; + +#endif // _OCL_PERF_SVM_ALLOC_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp new file mode 100644 index 0000000000..5c9be9f3e4 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.cpp @@ -0,0 +1,255 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfSVMKernelArguments.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +static const size_t BufSize = 0x1000; +static const size_t Iterations = 0x10000; +static const size_t TotalQueues = 4; +static const size_t TotalBufs = 4; +static const size_t TotalArgs = 4; + +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +static const char *Arguments[TotalArgs] = { + "__global uint* out", + "__global uint* out, __global uint* buf0, __global uint* buf1, __global " + "uint* buf2, __global uint* buf3", + "__global uint* out, __global uint* buf0, __global uint* buf1, __global " + "uint* buf2, __global uint* buf3, \n" + "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global " + "uint* buf7, __global uint* buf8", + "__global uint* out, __global uint* buf0, __global uint* buf1, __global " + "uint* buf2, __global uint* buf3,\n" + "__global uint* buf4, __global uint* buf5, __global uint* buf6, __global " + "uint* buf7, __global uint* buf8,\n" + "__global uint* buf9, __global uint* buf10, __global uint* buf11, __global " + "uint* buf12, __global uint* buf13,\n" + "__global uint* buf14, __global uint* buf15, __global uint* buf16, " + "__global uint* buf17, __global uint* buf18"}; + +static const char *strKernel = + "__kernel void dummy(%s) \n" + "{ \n" + " uint id = get_global_id(0); \n" + " uint value = 1; \n" + " out[id] = value; \n" + "} \n"; + +OCLPerfSVMKernelArguments::OCLPerfSVMKernelArguments() { + _numSubTests = TotalQueues * TotalArgs; // * TotalBufs; + failed_ = false; + skip_ = false; +} + +OCLPerfSVMKernelArguments::~OCLPerfSVMKernelArguments() {} + +void 
OCLPerfSVMKernelArguments::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { +#if defined(CL_VERSION_2_0) + // cl_mem buffer; + _deviceId = deviceId; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + test_ = test; + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + cl_device_svm_capabilities caps; + error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES, + sizeof(cl_device_svm_capabilities), &caps, NULL); + // check if CL_DEVICE_SVM_COARSE_GRAIN_BUFFER is set. Skip the test if not. + if (!(caps & 0x1)) { + skip_ = true; + testDescString = "SVM NOT supported. Test Skipped."; + return; + } + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + + size_t numArguments = (test_ / TotalQueues) % TotalArgs; + char *program = new char[4096]; + SNPRINTF(program, sizeof(char) * 4096, strKernel, Arguments[numArguments]); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&program, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + delete[] program; + + static const size_t NumBuffs[TotalBufs] = {0x20, 0x100, 0x800, 0x2000}; + + size_t 
bufSize = BufSize * sizeof(cl_int); + + numBufs_ = (unsigned int)NumBuffs[test_ / (TotalQueues * TotalArgs)]; + inOutBuffer = (void **)malloc(sizeof(void *) * numBufs_); + + for (size_t b = 0; b < numBufs_; ++b) { + inOutBuffer[b] = clSVMAlloc(context_, CL_MEM_READ_WRITE, bufSize, 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clSVMAlloc() failed"); + } +#else + skip_ = true; + testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped."; + return; +#endif +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfSVMKernelArguments::run(void) { + if (skip_) { + return; + } + + if (failed_) { + return; + } +#if defined(CL_VERSION_2_0) + CPerfCounter timer; + static const size_t Queues[] = {1, 2, 4, 8}; + size_t numQueues = Queues[test_ % TotalQueues]; + cl_uint numArguments; + _wrapper->clGetKernelInfo(kernel_, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), + &numArguments, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelInfo() failed"); + + size_t iter = Iterations / numQueues / numBufs_; + iter = (iter == 0) ? 
1 : iter; + + std::vector cmdQueues(numQueues); + for (size_t q = 0; q < numQueues; ++q) { + cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue( + context_, devices_[_deviceId], 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed"); + cmdQueues[q] = cmdQueue; + } + // Warm-up + for (size_t b = 0; b < (numBufs_ / numArguments); ++b) { + for (size_t q = 0; q < numQueues; ++q) { + for (cl_uint a = 0; a < numArguments; ++a) { + void *buffer = inOutBuffer[(b * numArguments + a) % numBufs_]; + error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, a, buffer); + CHECK_RESULT((error_ != CL_SUCCESS), + "clSetKernelArgSVMPointer() failed"); + } + + size_t gws[1] = {256}; + size_t lws[1] = {256}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL, + gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + } + for (size_t q = 0; q < numQueues; ++q) { + _wrapper->clFinish(cmdQueues[q]); + } + + size_t disp = 0; + timer.Reset(); + timer.Start(); + + for (size_t i = 0; i < iter; ++i) { + for (size_t b = 0; b < numBufs_; ++b) { + for (size_t q = 0; q < numQueues; ++q) { + for (cl_uint a = 0; a < numArguments; ++a) { + void *buffer = inOutBuffer[(b * numArguments + a) % numBufs_]; + error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, a, buffer); + CHECK_RESULT((error_ != CL_SUCCESS), + "clSetKernelArgSVMPointer() failed"); + } + + size_t gws[1] = {256}; + size_t lws[1] = {256}; + error_ = _wrapper->clEnqueueNDRangeKernel( + cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + disp++; + } + } + } + for (size_t q = 0; q < numQueues; ++q) { + _wrapper->clFinish(cmdQueues[q]); + } + timer.Stop(); + + for (size_t q = 0; q < numQueues; ++q) { + error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseCommandQueue() failed"); + } + + 
std::stringstream stream; + stream << "Setup time (us) for " << numQueues << " queues, "; + stream.flags(std::ios::right | std::ios::showbase); + stream.width(2); + stream << numArguments; + stream << " arguments, "; + stream.flags(std::ios::right | std::ios::showbase); + stream.width(4); + stream << numBufs_ << " buffers"; + testDescString = stream.str(); + _perfInfo = static_cast(timer.GetElapsedTime() * 1000000 / disp); +#endif +} + +unsigned int OCLPerfSVMKernelArguments::close(void) { +#if defined(CL_VERSION_2_0) + for (size_t b = 0; b < numBufs_; ++b) { + _wrapper->clSVMFree(context_, inOutBuffer[b]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clSVMFree() failed"); + } +#endif + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h new file mode 100644 index 0000000000..4b08fde849 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMKernelArguments.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_PERF_SVM_KERNEL_ARGUMENTS_H_ +#define _OCL_PERF_SVM_KERNEL_ARGUMENTS_H_ + +#include + +#include "OCLTestImp.h" + +class OCLPerfSVMKernelArguments : public OCLTestImp { + public: + OCLPerfSVMKernelArguments(); + virtual ~OCLPerfSVMKernelArguments(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int test_; + bool skip_; + void** inOutBuffer; + unsigned int numBufs_; +}; + +#endif // _OCL_PERF_SVM_KERNEL_ARGUMENTS_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp new file mode 100644 index 0000000000..e0a7aef3c3 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.cpp @@ -0,0 +1,153 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfSVMMap.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 5 +static size_t sizeList[] = { + 0x040000, 0x080000, 0x100000, 0x200000, 0x400000, +}; + +#define NUM_FLAGS 4 +static const cl_map_flags Flags[NUM_FLAGS] = {CL_MAP_READ, CL_MAP_WRITE, + CL_MAP_READ | CL_MAP_WRITE, + CL_MAP_WRITE_INVALIDATE_REGION}; + +OCLPerfSVMMap::OCLPerfSVMMap() { + _numSubTests = NUM_SIZES * NUM_FLAGS; + failed_ = false; + skip_ = false; +} + +OCLPerfSVMMap::~OCLPerfSVMMap() {} + +void OCLPerfSVMMap::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { +#if defined(CL_VERSION_2_0) + _deviceId = deviceId; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + testFlag_ = test / NUM_SIZES; + testSize_ = test % NUM_SIZES; + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + cl_device_svm_capabilities caps; + error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES, + sizeof(cl_device_svm_capabilities), &caps, NULL); + // check if CL_DEVICE_SVM_COARSE_GRAIN_BUFFER is set. Skip the test if not. 
+ if (!(caps & 0x1)) { + skip_ = true; + testDescString = "SVM NOT supported. Test Skipped."; + return; + } + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } +#else + skip_ = true; + testDescString = "SVM NOT supported for < 2.0 builds. Test Skipped."; + return; +#endif +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfSVMMap::run(void) { + if (skip_) { + return; + } + + if (failed_) { + return; + } +#if defined(CL_VERSION_2_0) + void *buffer; + CPerfCounter timer; + void *hostPtr = NULL; + + const size_t bufSize = sizeList[testSize_] * sizeof(cl_int4); + const cl_map_flags flag = Flags[testFlag_]; + const size_t iter = 100; + + timer.Reset(); + + buffer = clSVMAlloc(context_, CL_MEM_READ_WRITE, bufSize, 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clSVMAlloc() failed"); + + for (size_t i = 0; i < iter; ++i) { + timer.Start(); + + error_ = clEnqueueSVMMap(cmdQueues_[_deviceId], CL_FALSE, flag, buffer, + bufSize, 0, 0, 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMMap() failed"); + + error_ = clEnqueueSVMUnmap(cmdQueues_[_deviceId], buffer, 0, 0, 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMUnmap() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + + timer.Stop(); + } + + clSVMFree(context_, (void *)buffer); + + char pFlags[4]; + pFlags[0] = (testFlag_ == 0 || testFlag_ == 2) ? 'R' : '_'; // CL_MAP_READ + pFlags[1] = (testFlag_ == 1 || testFlag_ == 2) ? 'W' : '_'; // CL_MAP_WRITE + pFlags[2] = (testFlag_ == 3) ? 
'I' : '_'; // CL_MAP_WRITE_INVALIDATE_REGION + + char buf[256]; + SNPRINTF(buf, sizeof(buf), "Map + Unmap (GB/s) for %6d KB, flags=%3s", + (int)bufSize / 1024, pFlags); + + testDescString = buf; + double sec = timer.GetElapsedTime(); + _perfInfo = static_cast((bufSize * iter * (double)(1e-09)) / sec); +#endif +} + +unsigned int OCLPerfSVMMap::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h new file mode 100644 index 0000000000..eedc6b7d2a --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMap.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_PERF_SVM_MAP_H_ +#define _OCL_PERF_SVM_MAP_H_ + +#include "OCLTestImp.h" + +class OCLPerfSVMMap : public OCLTestImp { + public: + OCLPerfSVMMap(); + virtual ~OCLPerfSVMMap(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int testSize_; + unsigned int testFlag_; + bool skip_; +}; + +#endif // _OCL_PERF_SVM_MAP_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp new file mode 100644 index 0000000000..24c45a6b2a --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.cpp @@ -0,0 +1,214 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfSVMMemFill.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_MODES 3 +#define NUM_CG_FLAGS 2 +#define NUM_FG_FLAGS 3 + +static size_t typeSizeList[] = { + 1, // sizeof(cl_uchar) + 2, 4, 8, 16, 32, 64, + 128, // sizeof(cl_ulong16) +}; + +static unsigned int eleNumList[] = { + 0x0020000, 0x0080000, 0x0200000, 0x0800000, 0x2000000, +}; + +#if defined(CL_VERSION_2_0) +static const cl_svm_mem_flags CGFlags[NUM_CG_FLAGS] = { + CL_MEM_READ_WRITE, + CL_MEM_WRITE_ONLY, +}; +static const cl_svm_mem_flags FGFlags[NUM_FG_FLAGS] = { + 0, + CL_MEM_SVM_FINE_GRAIN_BUFFER, + CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, +}; +#endif + +OCLPerfSVMMemFill::OCLPerfSVMMemFill() { + num_typeSize_ = sizeof(typeSizeList) / sizeof(size_t); + num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); + _numSubTests = + num_elements_ * num_typeSize_ * (NUM_FG_FLAGS * NUM_CG_FLAGS + 1); + failed_ = false; + skip_ = false; +} + +OCLPerfSVMMemFill::~OCLPerfSVMMemFill() {} + +void OCLPerfSVMMemFill::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + +#if defined(CL_VERSION_2_0) + FGSystem_ = + (test >= (num_elements_ * num_typeSize_ * NUM_FG_FLAGS * NUM_CG_FLAGS)); + testFGFlag_ = + (test / (num_elements_ * num_typeSize_ * NUM_CG_FLAGS)) % NUM_FG_FLAGS; + testCGFlag_ = (test / (num_elements_ * num_typeSize_)) % NUM_CG_FLAGS; + testTypeSize_ = typeSizeList[(test / num_elements_) % num_typeSize_]; + testNumEle_ = eleNumList[test % num_elements_]; + + cl_device_svm_capabilities caps; + error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES, + sizeof(cl_device_svm_capabilities), &caps, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, 
"clGetDeviceInfo failed"); + if ((caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) == 0) { + skip_ = true; // Should never happen as OCL 2.0 devices are required to + // support coarse grain SVM + testDescString = "Coarse Grain Buffer NOT supported. Test Skipped."; + return; + } else if (testFGFlag_ > 0 && (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0) { + skip_ = true; // No support for fine grain buffer SVM + testDescString = "Fine Grain Buffer NOT supported. Test Skipped."; + return; + } else if (FGSystem_ && (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) { + skip_ = true; // No support for fine grain system SVM + testDescString = "Fine Grain System NOT supported. Test Skipped."; + return; + } else if (testFGFlag_ == 2 && ((caps & CL_DEVICE_SVM_ATOMICS) == 0)) { + skip_ = true; // No support for SVM Atomic + testDescString = "SVM Atomic NOT supported. Test Skipped."; + return; + } + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + return; +#else + skip_ = true; + testDescString = "SVM NOT supported for < 2.0 builds. 
Test Skipped."; + return; +#endif +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfSVMMemFill::run(void) { + if (skip_) { + return; + } + + if (failed_) { + return; + } +#if defined(CL_VERSION_2_0) + cl_uint *buffer = NULL; + CPerfCounter timer; + size_t iter = 100, bufSize = testNumEle_ * 4; + + cl_mem_flags flags = CGFlags[testCGFlag_] | FGFlags[testFGFlag_]; + + void *data = malloc(bufSize); + + timer.Reset(); + + if (!FGSystem_) { + buffer = + (cl_uint *)clSVMAlloc(context_, flags, bufSize, (cl_uint)testTypeSize_); + CHECK_RESULT(buffer == 0, "Allocation failed"); + } else { // FGSystem_ = true + buffer = (cl_uint *)malloc(bufSize); + CHECK_RESULT(buffer == 0, "Allocation failed"); + } + + timer.Start(); + for (size_t i = 0; i < iter; ++i) { + error_ = clEnqueueSVMMemFill(cmdQueues_[_deviceId], buffer, data, + testTypeSize_, bufSize, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMMemFill() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + + if (!FGSystem_) { + clSVMFree(context_, (void *)buffer); + } else { + free(buffer); + } + + char pFlags[5]; + pFlags[0] = + (testCGFlag_ == 0 || testCGFlag_ == 2) ? 'R' : '_'; // CL_MEM_READ_ONLY + pFlags[1] = + (testCGFlag_ == 0 || testCGFlag_ == 1) ? 'W' : '_'; // CL_MEM_WRITE_ONLY + pFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2) + ? 'F' + : '_'; // CL_MEM_SVM_FINE_GRAIN_BUFFER + pFlags[3] = (testFGFlag_ == 2) ? 
'A' : '_'; // CL_MEM_SVM_ATOMICS + + char buf[256]; + + if (!FGSystem_ && (testFGFlag_ == 0)) { + SNPRINTF(buf, sizeof(buf), + "Coarse Grain Buffer SVMMemFill (GB/s) for %6d KB, typeSize:%3d, " + "flags=%4s", + (int)bufSize / 1024, (int)testTypeSize_, pFlags); + } else if (!FGSystem_ && (testFGFlag_ > 0)) { + SNPRINTF(buf, sizeof(buf), + "Fine Grain Buffer SVMMemFill (GB/s) for %6d KB, typeSize:%3d, " + "flags=%4s", + (int)bufSize / 1024, (int)testTypeSize_, pFlags); + } else if (FGSystem_) { + SNPRINTF(buf, sizeof(buf), + "Fine Grain System SVMMemFill (GB/s) for %6d KB, typeSize:%3d, " + "flags=%4s", + (int)bufSize / 1024, (int)testTypeSize_, pFlags); + } + + testDescString = buf; + double sec = timer.GetElapsedTime(); + _perfInfo = static_cast((bufSize * iter * (double)(1e-09)) / sec); +#endif +} + +unsigned int OCLPerfSVMMemFill::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h new file mode 100644 index 0000000000..37ceed8d49 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemFill.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_PERF_SVM_MEMFILL_H_ +#define _OCL_PERF_SVM_MEMFILL_H_ + +#include "OCLTestImp.h" + +class OCLPerfSVMMemFill : public OCLTestImp { + public: + OCLPerfSVMMemFill(); + virtual ~OCLPerfSVMMemFill(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + unsigned int num_typeSize_; + unsigned int num_elements_; + bool FGSystem_; + size_t testTypeSize_; + unsigned int testCGFlag_; + unsigned int testFGFlag_; + unsigned int testNumEle_; + bool atomic_; + bool failed_; + bool skip_; +}; + +#endif // _OCL_PERF_SVM_MEMFILL_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp new file mode 100644 index 0000000000..33f0c05146 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.cpp @@ -0,0 +1,216 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfSVMMemcpy.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 5 +#define NUM_SRC_FLAGS 2 +#define NUM_DST_FLAGS 2 +#define NUM_FG_FLAGS 3 + +static size_t sizeList[NUM_SIZES] = { + 0x040000, 0x080000, 0x100000, 0x200000, 0x400000, +}; + +#if defined(CL_VERSION_2_0) +static const cl_svm_mem_flags srcFlagList[NUM_SRC_FLAGS] = {CL_MEM_READ_WRITE, + CL_MEM_READ_ONLY}; +static const cl_svm_mem_flags dstFlagList[NUM_DST_FLAGS] = {CL_MEM_READ_WRITE, + CL_MEM_WRITE_ONLY}; +static const cl_svm_mem_flags FGFlags[NUM_FG_FLAGS] = { + 0, + CL_MEM_SVM_FINE_GRAIN_BUFFER, + CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, +}; +#endif + +OCLPerfSVMMemcpy::OCLPerfSVMMemcpy() { + _numSubTests = (NUM_SRC_FLAGS * NUM_DST_FLAGS * NUM_FG_FLAGS + 1) * NUM_SIZES; + failed_ = false; + skip_ = false; +} + +OCLPerfSVMMemcpy::~OCLPerfSVMMemcpy() {} + +void OCLPerfSVMMemcpy::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + +#if defined(CL_VERSION_2_0) + FGSystem_ = + (test >= (NUM_SIZES * NUM_SRC_FLAGS * NUM_DST_FLAGS * NUM_FG_FLAGS)); + testFGFlag_ = + (test / (NUM_SIZES * NUM_DST_FLAGS * NUM_SRC_FLAGS)) % (NUM_FG_FLAGS); + testSrcFlag_ = (test / (NUM_SIZES * NUM_DST_FLAGS)) % (NUM_SRC_FLAGS); + testDstFlag_ = (test / NUM_SIZES) % (NUM_DST_FLAGS); + testSize_ = test % NUM_SIZES; + + cl_device_svm_capabilities caps; + error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SVM_CAPABILITIES, + sizeof(cl_device_svm_capabilities), &caps, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if ((caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) == 0) { + skip_ = true; // Should never happen as OCL 2.0 devices are required to + // 
support coarse grain SVM + testDescString = "Coarse Grain Buffer NOT supported. Test Skipped."; + return; + } else if ((testFGFlag_ > 0) && + (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0) { + skip_ = true; // No support for fine grain buffer SVM + testDescString = "Fine Grain Buffer NOT supported. Test Skipped."; + return; + } else if (FGSystem_ && (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) { + skip_ = true; // No support for fine grain system SVM + testDescString = "Fine Grain System NOT supported. Test Skipped."; + return; + } else if ((testFGFlag_ == 2) && ((caps & CL_DEVICE_SVM_ATOMICS) == 0)) { + skip_ = true; // No support for SVM Atomic + testDescString = "SVM Atomic NOT supported. Test Skipped."; + return; + } + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + return; +#else + skip_ = true; + testDescString = "SVM NOT supported for < 2.0 builds. 
Test Skipped."; + return; +#endif +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfSVMMemcpy::run(void) { + if (skip_) { + return; + } + + if (failed_) { + return; + } +#if defined(CL_VERSION_2_0) + cl_uint *src = NULL, *dst = NULL; + CPerfCounter timer; + + size_t bufSize = sizeList[testSize_] * sizeof(cl_int4); + size_t iter = 100; + + cl_mem_flags srcFlags = srcFlagList[testSrcFlag_] | FGFlags[testFGFlag_]; + cl_mem_flags dstFlags = dstFlagList[testDstFlag_] | FGFlags[testFGFlag_]; + + size_t gws[1] = {bufSize / sizeof(cl_int4)}; + size_t lws[1] = {64}; + + if (!FGSystem_) { + src = (cl_uint *)clSVMAlloc(context_, srcFlags, bufSize, 0); + CHECK_RESULT(src == 0, "Allocation failed"); + dst = (cl_uint *)clSVMAlloc(context_, dstFlags, bufSize, 0); + CHECK_RESULT(dst == 0, "Allocation failed"); + } else { // FGSystem_ == true + src = (cl_uint *)malloc(bufSize); + dst = (cl_uint *)malloc(bufSize); + } + + timer.Reset(); + timer.Start(); + for (size_t i = 0; i < iter; ++i) { + clEnqueueSVMMemcpy(cmdQueues_[_deviceId], false, dst, src, bufSize, 0, NULL, + NULL); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + + if (!FGSystem_) { + clSVMFree(context_, (void *)src); + clSVMFree(context_, (void *)dst); + } else { // FGSystem_ = true + free(src); + free(dst); + } + + char pSrcFlags[5]; + pSrcFlags[0] = + (testSrcFlag_ == 0 || testSrcFlag_ == 1) ? 'R' : '_'; // CL_MEM_READ_ONLY + pSrcFlags[1] = (testSrcFlag_ == 0) ? 'W' : '_'; // CL_MEM_WRITE_ONLY + pSrcFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2) + ? 'F' + : '_'; // CL_MEM_SVM_FINE_GRAIN_BUFFER + pSrcFlags[3] = (testFGFlag_ == 2) ? 'A' : '_'; // CL_MEM_SVM_ATOMICS + pSrcFlags[4] = '\0'; + + char pDstFlags[5]; + pDstFlags[0] = (testDstFlag_ == 0) ? 'R' : '_'; + pDstFlags[1] = (testDstFlag_ == 0 || testDstFlag_ == 1) ? 'W' : '_'; + pDstFlags[2] = (testFGFlag_ == 1 || testFGFlag_ == 2) ? 
'F' : '_'; + pDstFlags[3] = (testFGFlag_ == 2) ? 'A' : '_'; + pSrcFlags[4] = '\0'; + + char buf[256]; + + if (FGSystem_) { + SNPRINTF(buf, sizeof(buf), + "Fine Grain System SVMMemcpy (GB/s) for %6d KB, from:%4s to:%4s", + (int)bufSize / 1024, pSrcFlags, pDstFlags); + } else if (testFGFlag_ == 0) { + SNPRINTF(buf, sizeof(buf), + "Coarse Grain Buffer SVMMemcpy (GB/s) for %6d KB, from:%4s to:%4s", + (int)bufSize / 1024, pSrcFlags, pDstFlags); + } else { + SNPRINTF(buf, sizeof(buf), + "Fine Grain Buffer SVMMemcpy (GB/s) for %6d KB, from:%4s to:%4s", + (int)bufSize / 1024, pSrcFlags, pDstFlags); + } + + testDescString = buf; + double sec = timer.GetElapsedTime(); + _perfInfo = static_cast((bufSize * iter * (double)(1e-09)) / sec); +#endif +} + +unsigned int OCLPerfSVMMemcpy::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h new file mode 100644 index 0000000000..32fe4fb49e --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMMemcpy.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#ifndef _OCL_PERF_SVM_MEMCPY_H_
#define _OCL_PERF_SVM_MEMCPY_H_

#include "OCLTestImp.h"

// Performance test that times clEnqueueSVMMemcpy between two buffers for
// every combination of buffer size, source/destination access flags and SVM
// granularity, and reports the result in GB/s.
class OCLPerfSVMMemcpy : public OCLTestImp {
 public:
  OCLPerfSVMMemcpy();
  virtual ~OCLPerfSVMMemcpy();

 public:
  // Decodes the sub-test index and checks the device's SVM capabilities.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Runs the timed copies; does nothing when the sub-test was skipped or
  // failed in open().
  virtual void run(void);
  virtual unsigned int close(void);

 private:
  bool failed_;               // set by open() when the device is not a GPU
  unsigned int testSize_;     // index into the size table
  unsigned int testSrcFlag_;  // index into the source flag table
  unsigned int testDstFlag_;  // index into the destination flag table
  unsigned int testFGFlag_;   // 0: coarse grain, 1: fine grain, 2: + atomics
  bool FGSystem_;             // true: buffers come from malloc, not clSVMAlloc
  bool skip_;                 // set by open() when the device lacks support
};

#endif  // _OCL_PERF_SVM_MEMCPY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp
new file mode 100644
index 0000000000..f13e4cc410
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.cpp
@@ -0,0 +1,359 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfSVMSampleRate.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_TYPES 3 +static const char *types[NUM_TYPES] = {"float", "float2", "float4"}; +static const unsigned int typeSizes[NUM_TYPES] = {4, 8, 16}; + +#define NUM_SIZES 12 +static const unsigned int sizes[NUM_SIZES] = {1, 2, 4, 8, 16, 32, + 64, 128, 256, 512, 1024, 2048}; + +#define NUM_BUFS 6 +#define MAX_BUFS (1 << (NUM_BUFS - 1)) + +#define NUM_READS numBufs_ + +OCLPerfSVMSampleRate::OCLPerfSVMSampleRate() { + _numSubTests = NUM_TYPES * NUM_SIZES * NUM_BUFS * 3; + skip_ = false; +} + +OCLPerfSVMSampleRate::~OCLPerfSVMSampleRate() {} + +void OCLPerfSVMSampleRate::setKernel(void) { + shader_.clear(); + shader_ += + "kernel void sampleRate(global DATATYPE* outBuffer, unsigned int " + "inBufSize, unsigned int writeIt,\n"; + char buf[256]; + for (unsigned int i = 0; i < numBufs_; i++) { + SNPRINTF(buf, sizeof(buf), "global DATATYPE* inBuffer%d", i); + shader_ += buf; + if (i < (numBufs_ - 1)) { + shader_ += ","; + } + shader_ += "\n"; + } + shader_ += ")\n"; + shader_ += + "{\n" + " uint gid = get_global_id(0);\n" + " uint inputIdx = gid % inBufSize;\n" + " DATATYPE tmp = (DATATYPE)0.0f;\n"; + + for (unsigned int j = 0; j < (NUM_READS / numBufs_); j++) { + for (unsigned int i = 0; i < numBufs_; i++) { + SNPRINTF(buf, sizeof(buf), " tmp += inBuffer%d[inputIdx];\n", i); + 
shader_ += buf; + } + shader_ += " inputIdx += writeIt;\n"; // writeIt is 0, so we don't need + // to add a modulo + } + if (typeSizes[typeIdx_] > 4) { + shader_ += + " if (writeIt*(unsigned int)tmp.x) outBuffer[gid] = tmp;\n" + "}\n"; + } else { + shader_ += + " if (writeIt*(unsigned int)tmp) outBuffer[gid] = tmp;\n" + "}\n"; + } + // printf("Shader -> %s\n", shader_.c_str()); +} + +void OCLPerfSVMSampleRate::setData(void *buffer, unsigned int val) { +#if defined(CL_VERSION_2_0) + error_ = _wrapper->clEnqueueSVMMemFill( + cmd_queue_, buffer, &val, sizeof(unsigned int), bufSize_, 0, NULL, NULL); + if ((error_ == CL_MEM_OBJECT_ALLOCATION_FAILURE) || + (error_ == CL_OUT_OF_RESOURCES) || (error_ == CL_OUT_OF_HOST_MEMORY)) { + error_ = CL_SUCCESS; + skip_ = true; + testDescString = "Not enough memory, skipped"; + return; + } + _wrapper->clFinish(cmd_queue_); +#endif +} + +void OCLPerfSVMSampleRate::checkData(void *buffer) { +#if defined(CL_VERSION_2_0) + error_ = _wrapper->clEnqueueSVMMap(cmd_queue_, true, CL_MAP_READ, buffer, + outBufSize_, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueSVMMap failed"); + float *data = (float *)buffer; + for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) { + if (data[i] != (float)numBufs_) { + printf("Data validation failed at %d! 
Got %f, expected %f\n", i, data[i], + (float)numBufs_); + break; + } + } + error_ = _wrapper->clEnqueueSVMUnmap(cmd_queue_, buffer, 0, NULL, NULL); + _wrapper->clFinish(cmd_queue_); +#endif +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfSVMSampleRate::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_device_id device; + error_ = CL_SUCCESS; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + cmd_queue_ = 0; + inBuffer_ = NULL; + outBuffer_ = NULL; + coarseGrainBuffer_ = false; + fineGrainBuffer_ = false; + fineGrainSystem_ = false; + + // We compute a square domain + width_ = sizes[test % NUM_SIZES]; + typeIdx_ = (test / NUM_SIZES) % NUM_TYPES; + bufSize_ = width_ * width_ * typeSizes[typeIdx_]; + numBufs_ = (1 << ((test / (NUM_SIZES * NUM_TYPES)) % NUM_BUFS)); + svmMode_ = test / (NUM_SIZES * NUM_TYPES * NUM_BUFS); + + device = devices_[deviceId]; + +#if defined(CL_VERSION_2_0) + cl_device_svm_capabilities caps; + error_ = clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, + sizeof(cl_device_svm_capabilities), &caps, NULL); + if (svmMode_ == 0) { + if (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) { + coarseGrainBuffer_ = true; + testdesc = "crs"; + } else { + skip_ = true; // Should never happen as OCL 2.0 devices are required to + // support coarse grain SVM + testDescString = "Coarse grain SVM NOT supported. Test Skipped."; + return; + } + } else if (svmMode_ == 1) { + if (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) { + fineGrainBuffer_ = true; + testdesc = "fgb"; + } else { + skip_ = true; // No support for fine grain buffer SVM + testDescString = "Fine grain buffer SVM NOT supported. 
Test Skipped."; + return; + } + } else if (svmMode_ == 2) { + if (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) { + fineGrainSystem_ = true; + testdesc = "fgs"; + } else { + skip_ = true; // No support for fine grain system SVM + testDescString = "Fine grain system SVM NOT supported. Test Skipped."; + return; + } + } + + char charbuf[1024]; + + cmd_queue_ = cmdQueues_[_deviceId]; + + outBufSize_ = + sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1]; + if ((svmMode_ == 0) || (svmMode_ == 1)) { + inBuffer_ = (void **)malloc(sizeof(void *) * numBufs_); + memset(inBuffer_, 0, sizeof(void *) * numBufs_); + cl_mem_flags flags; + flags = CL_MEM_READ_ONLY; + if (svmMode_ == 1) flags |= CL_MEM_SVM_FINE_GRAIN_BUFFER; + for (unsigned int i = 0; i < numBufs_; i++) { + inBuffer_[i] = _wrapper->clSVMAlloc(context_, flags, bufSize_, 0); + CHECK_RESULT(inBuffer_[i] == NULL, "clCreateBuffer(inBuffer) failed"); + } + + flags = CL_MEM_WRITE_ONLY; + if (svmMode_ == 1) flags |= CL_MEM_SVM_FINE_GRAIN_BUFFER; + outBuffer_ = _wrapper->clSVMAlloc(context_, flags, outBufSize_, 0); + CHECK_RESULT(outBuffer_ == NULL, "clCreateBuffer(outBuffer) failed"); + } else { + inBuffer_ = (void **)malloc(sizeof(void *) * numBufs_); + memset(inBuffer_, 0, sizeof(void *) * numBufs_); + for (unsigned int i = 0; i < numBufs_; i++) { + inBuffer_[i] = malloc(bufSize_); + CHECK_RESULT(inBuffer_[i] == NULL, "malloc(inBuffer) failed"); + } + outBuffer_ = malloc(outBufSize_); + CHECK_RESULT(outBuffer_ == NULL, "malloc(outBuffer) failed"); + } + + setKernel(); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + const char *buildOps = NULL; + // Have to force OCL 2.0 to use SVM + SNPRINTF(charbuf, sizeof(charbuf), "-cl-std=CL2.0 -D DATATYPE=%s", + types[typeIdx_]); + buildOps = charbuf; + error_ = _wrapper->clBuildProgram(program_, 1, &device, 
buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "sampleRate", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, 0, outBuffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed"); + unsigned int sizeDW = width_ * width_; + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), + (void *)&sizeDW); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(sizeDW) failed"); + unsigned int writeIt = 0; + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), + (void *)&writeIt); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(writeIt) failed"); + for (unsigned int i = 0; i < numBufs_; i++) { + error_ = _wrapper->clSetKernelArgSVMPointer(kernel_, i + 3, inBuffer_[i]); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed"); + setData(inBuffer_[i], 0x3f800000); + if (skip_) return; + } + setData(outBuffer_, 0xdeadbeef); +#else + skip_ = true; + testDescString = "SVM NOT supported for < 2.0 builds. 
Test Skipped."; + return; +#endif +} + +void OCLPerfSVMSampleRate::run(void) { + int global = outBufSize_ / typeSizes[typeIdx_]; + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_); + + if (skip_) return; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < maxIter; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + } + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Test doesn't write anything, so nothing to check + // checkData(outBuffer_); + // Compute GB/s + double perf = + ((double)outBufSize_ * NUM_READS * (double)maxIter * (double)(1e-09)) / + sec; + char buf[256]; + SNPRINTF(buf, sizeof(buf), "Domain %dx%d, %2d %s bufs, %6s, %4dx%4d (GB/s)", + sizes[NUM_SIZES - 1], sizes[NUM_SIZES - 1], numBufs_, + testdesc.c_str(), types[typeIdx_], width_, width_); + + _perfInfo = (float)perf; + testDescString = buf; +} + +unsigned int OCLPerfSVMSampleRate::close(void) { +#if defined(CL_VERSION_2_0) + if (cmd_queue_) _wrapper->clFinish(cmd_queue_); + + if ((svmMode_ == 0) || (svmMode_ == 1)) { + if (inBuffer_) { + for (unsigned int i = 0; i < numBufs_; i++) { + if (inBuffer_[i]) { + _wrapper->clSVMFree(context_, inBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clSVMFree(inBuffer_) failed"); + } + } + free(inBuffer_); + } + if (outBuffer_) { + _wrapper->clSVMFree(context_, outBuffer_); + } + } else { + if (inBuffer_) { + for (unsigned int i = 0; i < numBufs_; i++) { + if (inBuffer_[i]) { + free(inBuffer_[i]); + } + } + free(inBuffer_); + } + if (outBuffer_) { + free(outBuffer_); + } + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + 
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } +#endif + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h new file mode 100644 index 0000000000..c388766cdd --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSVMSampleRate.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_SVMSAMPLERATE_H_ +#define _OCL_SVMSAMPLERATE_H_ + +#include "OCLTestImp.h" + +class OCLPerfSVMSampleRate : public OCLTestImp { + public: + OCLPerfSVMSampleRate(); + virtual ~OCLPerfSVMSampleRate(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void setData(void* buffer, unsigned int data); + void checkData(void* buffer); + void setKernel(void); + + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + void** inBuffer_; + void* outBuffer_; + + unsigned int width_; + unsigned int bufSize_; + unsigned int outBufSize_; + static const unsigned int MAX_ITERATIONS = 25; + unsigned int numBufs_; + unsigned int typeIdx_; + unsigned int svmMode_; + + bool skip_; + bool coarseGrainBuffer_; + bool fineGrainBuffer_; + bool fineGrainSystem_; + std::string testdesc; +}; + +#endif // _OCL_SVMSAMPLERATE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp new file mode 100644 index 0000000000..11ff83b692 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.cpp @@ -0,0 +1,336 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfSampleRate.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_TYPES 3 +static const char *types[NUM_TYPES] = {"float", "float2", "float4"}; +static const unsigned int typeSizes[NUM_TYPES] = {4, 8, 16}; + +#define NUM_SIZES 12 +static const unsigned int sizes[NUM_SIZES] = {1, 2, 4, 8, 16, 32, + 64, 128, 256, 512, 1024, 2048}; + +#define NUM_BUFS 6 +#define MAX_BUFS (1 << (NUM_BUFS - 1)) + +OCLPerfSampleRate::OCLPerfSampleRate() { + _numSubTests = NUM_TYPES * NUM_SIZES * NUM_BUFS; + skip_ = false; +} + +OCLPerfSampleRate::~OCLPerfSampleRate() {} + +void OCLPerfSampleRate::setKernel(void) { + shader_.clear(); + shader_ += + "kernel void sampleRate(global DATATYPE* outBuffer, unsigned int " + "inBufSize, unsigned int writeIt,\n"; + char buf[256]; + for (unsigned int i = 0; i < numBufs_; i++) { + SNPRINTF(buf, sizeof(buf), "global DATATYPE* inBuffer%d", i); + shader_ += buf; + if (i < (numBufs_ - 1)) { + shader_ += ","; + } + shader_ += "\n"; + } + shader_ += ")\n"; + shader_ += + "{\n" + " uint gid = get_global_id(0);\n" + " uint inputIdx = gid % inBufSize;\n" + " DATATYPE tmp = (DATATYPE)0.0f;\n"; + + for (unsigned int i = 0; i < numBufs_; i++) { + SNPRINTF(buf, sizeof(buf), " tmp += inBuffer%d[inputIdx];\n", i); + shader_ += buf; + } + if (typeSizes[typeIdx_] > 4) { + shader_ += + " if (writeIt*(unsigned int)tmp.x) 
outBuffer[gid] = tmp;\n" + "}\n"; + } else { + shader_ += + " if (writeIt*(unsigned int)tmp) outBuffer[gid] = tmp;\n" + "}\n"; + } + // printf("Shader -> %s\n", shader_.c_str()); +} + +void OCLPerfSampleRate::setData(cl_mem buffer, unsigned int val) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + if (data == NULL) { + if ((error_ == CL_MEM_OBJECT_ALLOCATION_FAILURE) || + (error_ == CL_OUT_OF_RESOURCES) || (error_ == CL_OUT_OF_HOST_MEMORY)) { + printf("WARNING: Not enough memory, skipped\n"); + error_ = CL_SUCCESS; + skip_ = true; + } else { + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueMapBuffer failed"); + } + return; + } + for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) + data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +void OCLPerfSampleRate::checkData(cl_mem buffer) { + float *data = (float *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_READ, 0, outBufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) { + if (data[i] != (float)numBufs_) { + printf("Data validation failed at %d! 
Got %f, expected %f\n", i, data[i], + (float)numBufs_); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfSampleRate::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + + // We compute a square domain + width_ = sizes[test % NUM_SIZES]; + typeIdx_ = (test / NUM_SIZES) % NUM_TYPES; + bufSize_ = width_ * width_ * typeSizes[typeIdx_]; + numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES))); + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + platform = platforms[_platformIndex]; + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + delete platforms; + } + /* + * If we could find a platform, use it. 
+ */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = (cl_mem *)malloc(sizeof(cl_mem) * numBufs_); + memset(inBuffer_, 0, sizeof(cl_mem) * numBufs_); + for (unsigned int i = 0; i < numBufs_; i++) { + inBuffer_[i] = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, + bufSize_, NULL, &error_); + CHECK_RESULT(inBuffer_[i] == 0, "clCreateBuffer(inBuffer) failed"); + } + + outBufSize_ = + sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1]; + outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + outBufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + setKernel(); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + const char *buildOps = NULL; + SNPRINTF(charbuf, sizeof(charbuf), "-D DATATYPE=%s", types[typeIdx_]); + buildOps = charbuf; + error_ = _wrapper->clBuildProgram(program_, 1, 
&device, buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "sampleRate", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(outBuffer) failed"); + unsigned int sizeDW = width_ * width_; + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), + (void *)&sizeDW); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(sizeDW) failed"); + unsigned int writeIt = 0; + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), + (void *)&writeIt); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(writeIt) failed"); + for (unsigned int i = 0; i < numBufs_; i++) { + error_ = _wrapper->clSetKernelArg(kernel_, i + 3, sizeof(cl_mem), + (void *)&inBuffer_[i]); + CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg(inBuffer) failed"); + setData(inBuffer_[i], 0x3f800000); + if (skip_) return; + } + setData(outBuffer_, 0xdeadbeef); +} + +void OCLPerfSampleRate::run(void) { + int global = outBufSize_ / typeSizes[typeIdx_]; + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_); + + if (skip_) return; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < maxIter; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + } + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double 
sec = timer.GetElapsedTime(); + + // checkData(outBuffer_); + // Compute GB/s + double perf = + ((double)outBufSize_ * numBufs_ * (double)maxIter * (double)(1e-09)) / + sec; + char buf[256]; + SNPRINTF(buf, sizeof(buf), "Domain %dx%d, %2d bufs, %6s, %4dx%4d (GB/s)", + sizes[NUM_SIZES - 1], sizes[NUM_SIZES - 1], numBufs_, + types[typeIdx_], width_, width_); + + _perfInfo = (float)perf; + testDescString = buf; +} + +unsigned int OCLPerfSampleRate::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + for (unsigned int i = 0; i < numBufs_; i++) { + if (inBuffer_[i]) { + error_ = _wrapper->clReleaseMemObject(inBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + } + free(inBuffer_); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h new file mode 100644 index 0000000000..93bf6dc6a1 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSampleRate.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_SAMPLERATE_H_
+#define _OCL_SAMPLERATE_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that reports a throughput figure in GB/s computed from the
+// output buffer size, the number of input buffers and the iteration count.
+class OCLPerfSampleRate : public OCLTestImp {
+ public:
+  OCLPerfSampleRate();
+  virtual ~OCLPerfSampleRate();
+
+ public:
+  // Test life cycle: open() prepares a sub-test, run() times it and fills in
+  // the performance result, close() releases every OpenCL object and returns
+  // the CRC word.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Kernel source text.
+  std::string shader_;
+  void setData(cl_mem buffer, unsigned int data);
+  void checkData(cl_mem buffer);
+  void setKernel(void);
+
+  // OpenCL objects owned by the test; each one is released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  // Heap array of numBufs_ input memory objects; close() releases each entry
+  // and then free()s the array itself.
+  cl_mem* inBuffer_;
+  cl_mem outBuffer_;
+  // Status of the most recent OpenCL call.
+  cl_int error_;
+
+  // Reported as the "WxW" dimension in the result description.
+  unsigned int width_;
+  unsigned int bufSize_;
+  // Size of the output buffer in bytes; used for the GB/s figure.
+  unsigned int outBufSize_;
+  // NOTE(review): presumably the upper bound on timed iterations -- confirm
+  // against run().
+  static const unsigned int MAX_ITERATIONS = 25;
+  // Number of entries in inBuffer_.
+  unsigned int numBufs_;
+  // Index into the data-type name table used for the result description.
+  unsigned int typeIdx_;
+
+  // NOTE(review): presumably set when the sub-test cannot run on this device
+  // -- confirm against open().
+  bool skip_;
+};
+
+#endif  // _OCL_SAMPLERATE_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp
new file mode 100644
index 0000000000..922ae44025
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.cpp
@@ -0,0 +1,325 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfScalarReplArrayElem.h"
+
+// NOTE(review): the three header names below were lost in extraction; they are
+// restored from the calls this file makes (printf/snprintf, malloc/free) --
+// confirm against the upstream file.
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 1
+static const unsigned int Sizes[NUM_SIZES] = {16777216};  // 16 MB
+
+// Formats the OpenCL C source of the _ldsReadSpeed kernel into `source`.
+//   vtypeName  - OpenCL type used for the local array and the output buffer
+//   arrayLen   - element count of the __local array
+//   loopCount  - number of local-memory reads each work-item performs
+//   source     - destination buffer
+//   sourceSize - capacity of `source` in bytes; output is bounded by it
+static void genKernelSource(const char *vtypeName, unsigned arrayLen,
+                            unsigned loopCount, char *source,
+                            size_t sourceSize) {
+  SNPRINTF(source, sourceSize,
+           "%s foo(uint lid, __local %s *localLocal)\n"
+           "{\n"
+           " %s val0 = 0.0f;\n"
+           " %s val1 = 0.0f;\n"
+           " for (int i = 0; i < %d; ++i) {\n"
+           " val0 += localLocal[lid];\n"
+           " lid += 16;\n"
+           " }\n"
+           " %s val = val0+val1;\n"
+           " return val;\n"
+           "}\n"
+           "__kernel __attribute__((reqd_work_group_size(64,1,1)))"
+           " void _ldsReadSpeed(__global %s *outBuf)\n"
+           "{\n"
+           " uint gid = (int) get_global_id(0);\n"
+           " uint lid = (int) get_local_id(0);\n"
+           " __local %s localLocal[%d];\n"
+           " outBuf[gid] = foo(lid, localLocal);\n"
+           "}\n",
+           vtypeName, vtypeName, vtypeName, vtypeName, loopCount, vtypeName,
+           vtypeName, vtypeName, arrayLen);
+}
+
+// Scalar element type: OpenCL spelling and size in bytes.
+typedef struct {
+  const char *name;
+  unsigned nBytes;
+} ExplicitType;
+
+static const ExplicitType tyChar = {"char", 1};
+static const ExplicitType tyShort = {"short", 2};
+static const ExplicitType tyInt = {"int", 4};
+static const ExplicitType tyLong = {"long", 8};
+static const ExplicitType tyFloat = {"float", 4};
+static const ExplicitType tyDouble = {"double", 8};
+
+// Vector type: element type, lane count and OpenCL spelling.
+typedef struct {
+  ExplicitType elemType;
+  unsigned nElems;
+  const char *name;
+  // Total size of one vector value in bytes.
+  unsigned getSize() const { return elemType.nBytes * nElems; }
+} VectorType;
+
+// One sub-test per entry, grouped by total vector size (8, 16, 32, 64, 128
+// bytes).
+static const VectorType vecTypes[] = {
+    {tyChar, 8, "char8"},     {tyShort, 4, "short4"},  {tyInt, 2, "int2"},
+    {tyFloat, 2, "float2"},   {tyLong, 1, "long"},
+
+    {tyChar, 16, "char16"},   {tyShort, 8, "short8"},  {tyInt, 4, "int4"},
+    {tyFloat, 4, "float4"},   {tyLong, 2, "long2"},
+
+    {tyShort, 16, "short16"}, {tyInt, 8, "int8"},      {tyFloat, 8, "float8"},
+    {tyLong, 4, "long4"},
+
+    {tyInt, 16, "int16"},     {tyFloat, 16, "float16"}, {tyLong, 8, "long8"},
+
+    {tyLong, 16, "long16"}};
+// Bytes of local memory the generated kernel declares.
+static const unsigned ldsBytes = 4 * 4096;
+static const unsigned nVecTypes = sizeof(vecTypes) / sizeof(VectorType);
+
+// Generates the kernel source for vecTypes[idx] into shader_ and records the
+// per-work-item read count (numReads_) and item size in bytes (itemWidth_).
+// The caller must pass idx < nVecTypes.
+void OCLPerfScalarReplArrayElem::genShader(unsigned int idx) {
+  const VectorType &vecType = vecTypes[idx];
+  unsigned arrayLen = ldsBytes / vecType.getSize();
+  unsigned loopCount = arrayLen / 16;
+  char source[7192];
+  genKernelSource(vecType.name, arrayLen, loopCount, source, sizeof(source));
+  shader_ = source;
+  numReads_ = loopCount;
+  itemWidth_ = vecType.getSize();
+}
+
+OCLPerfScalarReplArrayElem::OCLPerfScalarReplArrayElem() {
+  _numSubTests = NUM_SIZES * nVecTypes;
+}
+
+OCLPerfScalarReplArrayElem::~OCLPerfScalarReplArrayElem() {}
+
+// Fills `buffer` (bufSize_ bytes, viewed as floats) with `val` through a
+// blocking write mapping.
+void OCLPerfScalarReplArrayElem::setData(cl_mem buffer, float val) {
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_WRITE, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  // A failed map returns NULL; do not write through it.
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val;
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Maps `buffer` for reading and checks that every float equals numReads_.
+// On the first mismatch it prints the index, the expected value and up to four
+// values starting at that index, then flags the test as failed.
+void OCLPerfScalarReplArrayElem::checkData(cl_mem buffer) {
+  const unsigned int count = bufSize_ >> 2;
+  float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true,
+                                                      CL_MAP_READ, 0, bufSize_,
+                                                      0, NULL, NULL, &error_);
+  // A failed map returns NULL; do not read through it.
+  CHECK_RESULT(data == NULL, "clEnqueueMapBuffer failed");
+  for (unsigned int i = 0; i < count; i++) {
+    if (data[i] != (float)numReads_) {
+      printf("Data validation failed at index %u!\n", i);
+      printf("Expected %u\nGot", numReads_);
+      // Stay inside the mapping even when the mismatch is near the end.
+      for (unsigned int j = i; j < count && j < i + 4; j++) {
+        printf(" %u", (unsigned int)data[j]);
+      }
+      printf("\n");
+      // The macro reports when its condition is true, as at every other call
+      // site in this file; a literal 0 here would silently pass.
+      CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
+      break;
+    }
+  }
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
+                                             NULL);
+}
+
+// Context error callback; intentionally ignores all notifications.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Prepares sub-test `test`: selects the platform and device, creates the
+// context, queue and output buffer, generates and builds the kernel for the
+// selected vector type and binds the output buffer as kernel argument 0.
+// `conversion` is set to 1.0; `units` is not written.
+void OCLPerfScalarReplArrayElem::open(unsigned int test, char *units,
+                                      double &conversion,
+                                      unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  program_ = 0;
+  kernel_ = 0;
+  outBuffer_ = 0;
+  _openTest = test;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (error_ == CL_SUCCESS && _platformIndex < numPlatforms) {
+      platform = platforms[_platformIndex];
+    }
+    // Array form of delete to match new[]; released before any early return.
+    delete[] platforms;
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+
+    if (platform != NULL) {
+      /* Get the number of requested devices */
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so the status is deliberately not checked.
+      error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL,
+                                        &num_devices);
+    }
+  }
+
+  width_ = Sizes[test % NUM_SIZES];
+  shaderIdx_ = test / NUM_SIZES;
+  bufSize_ = width_;
+
+  /*
+   * If we could find our platform, use it. If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  CHECK_RESULT(shaderIdx_ >= nVecTypes, "Requested sub-test not available");
+  CHECK_RESULT(num_devices == 0, "no devices");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ =
+      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
+  if (error_ == CL_SUCCESS && _deviceId < num_devices) {
+    device = devices[_deviceId];
+  }
+  // Only the selected id is needed from here on; free before any early return.
+  free(devices);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
+  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
+
+  genShader(shaderIdx_);
+  const char *src = shader_.c_str();
+  program_ =
+      _wrapper->clCreateProgramWithSource(context_, 1, &src, NULL, &error_);
+  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
+
+  if (error_ != CL_SUCCESS) {
+    char log[16384];
+    // Keep the printf below well-defined even if the log query fails.
+    log[0] = '\0';
+    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
+                                    sizeof(log), log, NULL);
+    printf("Build error -> %s\n", log);
+
+    // Condition must be true for the failure to be reported.
+    CHECK_RESULT(true, "clBuildProgram failed");
+  }
+  kernel_ = _wrapper->clCreateKernel(program_, "_ldsReadSpeed", &error_);
+  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
+
+  // setData(outBuffer_, 1.2345678f);
+}
+
+// Enqueues the kernel NUM_ITER times over bufSize_ / itemWidth_ work-items in
+// groups of 64, times the batch and stores the result in _perfInfo (GB/s) and
+// testDescString.
+void OCLPerfScalarReplArrayElem::run(void) {
+  int global = bufSize_ / itemWidth_;
+  int local = 64;
+
+  size_t global_work_size[1] = {(size_t)global};
+  size_t local_work_size[1] = {(size_t)local};
+
+  CPerfCounter timer;
+
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(
+        cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
+        (const size_t *)local_work_size, 0, NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
+  }
+  _wrapper->clFinish(cmd_queue_);
+
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+
+  // Local-memory read bandwidth in GB/s: work-items * reads per work-item *
+  // bytes per read * iterations / elapsed seconds.
+  double perf =
+      ((double)global * numReads_ * itemWidth_ * NUM_ITER * (double)(1e-09)) /
+      sec;
+
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " %10s %8d threads, %4d reads (GB/s)",
+           vecTypes[shaderIdx_].name, global, numReads_);
+  testDescString = buf;
+  // checkData(outBuffer_);
+}
+
+// Releases every OpenCL object created by open(); release failures are
+// reported without aborting the remaining releases. Returns _crcword.
+unsigned int OCLPerfScalarReplArrayElem::close(void) {
+  if (outBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(outBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(outBuffer_) failed");
+  }
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+  }
+  if (program_) {
+    error_ = _wrapper->clReleaseProgram(program_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git
a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h new file mode 100644 index 0000000000..f931c2fc18 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfScalarReplArrayElem.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_ScalarReplArrayElem_H_
+#define _OCL_ScalarReplArrayElem_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times a generated kernel reading a __local array and
+// reports the read bandwidth in GB/s; one sub-test per vector type.
+class OCLPerfScalarReplArrayElem : public OCLTestImp {
+ public:
+  OCLPerfScalarReplArrayElem();
+  virtual ~OCLPerfScalarReplArrayElem();
+
+ public:
+  // Test life cycle: open() builds the kernel for sub-test `test`, run() times
+  // NUM_ITER launches, close() releases the OpenCL objects and returns the CRC
+  // word.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+  // Kernel source produced by genShader().
+  std::string shader_;
+  // Generates the kernel source for vector-type table entry `idx`.
+  void genShader(unsigned int idx);
+  // Fills `buffer` with `data`, viewed as floats.
+  void setData(cl_mem buffer, float data);
+  // Verifies that every float in `buffer` equals numReads_.
+  void checkData(cl_mem buffer);
+
+  // Kernel launches per timed run.
+  static const unsigned int NUM_ITER = 100;
+
+  // OpenCL objects created in open() and released in close().
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_program program_;
+  cl_kernel kernel_;
+  cl_mem outBuffer_;
+  // Status of the most recent OpenCL call.
+  cl_int error_;
+
+  // Entry of the size table selected by the sub-test.
+  unsigned int width_;
+  // Output buffer size in bytes (set equal to width_).
+  unsigned int bufSize_;
+  // Local-memory reads each work-item performs.
+  unsigned int numReads_;
+  // Index of the selected vector type.
+  unsigned int shaderIdx_;
+  // Size in bytes of one vector value of the selected type.
+  unsigned int itemWidth_;
+  // NOTE(review): vecTypeIdx_ and vecSizeIdx_ are not referenced by the
+  // implementation file in this patch -- possibly leftovers; confirm.
+  unsigned int vecTypeIdx_;
+  unsigned int vecSizeIdx_;
+};
+
+#endif  // _OCL_ScalarReplArrayElem_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp
new file mode 100644
index 0000000000..5cad2e0a51
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.cpp
@@ -0,0 +1,261 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSdiP2PCopy.h"
+
+// NOTE(review): the header name on the next line was lost in extraction;
+// <string.h> is restored because this file calls strstr() and memcmp() --
+// confirm against the upstream file.
+#include <string.h>
+
+#include "Timer.h"
+
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 5
+// 64KB, 256KB, 1 MB, 4MB, 16 MB
+static const unsigned int Sizes[NUM_SIZES] = {65536, 262144, 1048576, 4194304,
+                                              16777216};
+
+OCLPerfSdiP2PCopy::OCLPerfSdiP2PCopy() {
+  // If there are two different gpus in the system,
+  // we have to test each of them
+  _numSubTests = 2 * NUM_SIZES;
+}
+
+OCLPerfSdiP2PCopy::~OCLPerfSdiP2PCopy() {}
+
+// Prepares sub-test `test`. Sub-tests [0, NUM_SIZES) copy from GPU 1 into a
+// bus-addressable buffer on GPU 0; sub-tests [NUM_SIZES, 2 * NUM_SIZES) swap
+// the two devices. The test is skipped ("silent failure") unless exactly two
+// GPUs are present and both expose cl_amd_bus_addressable_memory.
+// `conversion` is set to 1.0; `units` and `deviceId` are not used.
+void OCLPerfSdiP2PCopy::open(unsigned int test, char* units, double& conversion,
+                             unsigned int deviceId) {
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _openTest = test % NUM_SIZES;
+  bufSize_ = Sizes[_openTest];
+  error_ = 0;
+  srcBuff_ = 0;
+  inputArr_ = 0;
+  outputArr_ = 0;
+  extPhysicalBuff_ = 0;
+  silentFailure = false;
+  busAddressableBuff_ = 0;
+  devices_[0] = devices_[1] = 0;
+  contexts_[0] = contexts_[1] = 0;
+  cmd_queues_[0] = cmd_queues_[1] = 0;
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(numPlatforms == 0, "clGetPlatformIDs failed");
+  error_ = _wrapper->clGetPlatformIDs(1, &platform, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  error_ = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
+                                    &num_devices);
+  if (num_devices != 2) {
+    printf(
+        "\nSilent Failure: Two GPUs are required to run OCLPerfSdiP2PCopy "
+        "test\n");
+    silentFailure = true;
+    return;
+  }
+  error_ = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices,
+                                    devices_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+  if (test >= NUM_SIZES) {
+    // Second half of the sub-tests runs the copy in the opposite direction.
+    cl_device_id temp = devices_[0];
+    devices_[0] = devices_[1];
+    devices_[1] = temp;
+  }
+
+  // Both devices must expose the bus-addressable-memory extension.
+  for (unsigned int gpu = 0; gpu < 2; ++gpu) {
+    size_t param_size = 0;
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_EXTENSIONS, 0,
+                                       0, &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    char* strExtensions = (char*)malloc(param_size);
+    CHECK_RESULT(strExtensions == 0, "malloc failed");
+    error_ = _wrapper->clGetDeviceInfo(devices_[gpu], CL_DEVICE_EXTENSIONS,
+                                       param_size, strExtensions, 0);
+    bool supported =
+        (error_ == CL_SUCCESS) &&
+        (strstr(strExtensions, "cl_amd_bus_addressable_memory") != 0);
+    // Free before any early return so the string never leaks.
+    free(strExtensions);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    if (!supported) {
+      printf(
+          "\nSilent Failure: cl_amd_bus_addressable_memory extension is not "
+          "enabled on GPU %u\n",
+          gpu);
+      silentFailure = true;
+      return;
+    }
+  }
+
+  // Description suffix " [<source device>-><destination device>]".
+  static const unsigned int nameOrder[2] = {1, 0};
+  deviceNames_ = " [";
+  for (unsigned int n = 0; n < 2; ++n) {
+    cl_device_id dev = devices_[nameOrder[n]];
+    size_t param_size = 0;
+    error_ = _wrapper->clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, 0, &param_size);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+    char* strDeviceName = (char*)malloc(param_size);
+    CHECK_RESULT(strDeviceName == 0, "malloc failed");
+    error_ = _wrapper->clGetDeviceInfo(dev, CL_DEVICE_NAME, param_size,
+                                       strDeviceName, 0);
+    if (error_ == CL_SUCCESS) {
+      if (n != 0) {
+        deviceNames_ = deviceNames_ + "->";
+      }
+      deviceNames_ = deviceNames_ + strDeviceName;
+    }
+    // Free before any early return so the string never leaks.
+    free(strDeviceName);
+    CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  }
+  deviceNames_ = deviceNames_ + "]";
+
+  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)platform, 0};
+
+  contexts_[0] =
+      _wrapper->clCreateContext(props, 1, &devices_[0], 0, 0, &error_);
+  CHECK_RESULT(contexts_[0] == 0, "clCreateContext failed");
+  contexts_[1] =
+      _wrapper->clCreateContext(props, 1, &devices_[1], 0, 0, &error_);
+  CHECK_RESULT(contexts_[1] == 0, "clCreateContext failed");
+  cmd_queues_[0] =
+      _wrapper->clCreateCommandQueue(contexts_[0], devices_[0], 0, NULL);
+  CHECK_RESULT(cmd_queues_[0] == 0, "clCreateCommandQueue failed");
+  cmd_queues_[1] =
+      _wrapper->clCreateCommandQueue(contexts_[1], devices_[1], 0, NULL);
+  CHECK_RESULT(cmd_queues_[1] == 0, "clCreateCommandQueue failed");
+
+  // Destination: bus-addressable buffer on device 0, made resident so its bus
+  // address can back an external-physical buffer in device 1's context.
+  busAddressableBuff_ = _wrapper->clCreateBuffer(
+      contexts_[0], CL_MEM_BUS_ADDRESSABLE_AMD, bufSize_, 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  error_ = _wrapper->clEnqueueMakeBuffersResidentAMD(
+      cmd_queues_[0], 1, &busAddressableBuff_, true, &busAddr_, 0, 0, 0);
+  CHECK_RESULT((error_ != CL_SUCCESS),
+               "clEnqueueMakeBuffersResidentAMD failed");
+  extPhysicalBuff_ = _wrapper->clCreateBuffer(
+      contexts_[1], CL_MEM_EXTERNAL_PHYSICAL_AMD, bufSize_, &busAddr_, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  srcBuff_ = _wrapper->clCreateBuffer(contexts_[1], CL_MEM_READ_WRITE, bufSize_,
+                                      0, &error_);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clCreateBuffer failed");
+
+  // Host reference data: input is 1..N, output starts zeroed.
+  inputArr_ = (cl_uint*)malloc(bufSize_);
+  outputArr_ = (cl_uint*)malloc(bufSize_);
+  CHECK_RESULT(inputArr_ == 0 || outputArr_ == 0, "malloc failed");
+  for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_uint)); ++i) {
+    inputArr_[i] = i + 1;
+    outputArr_[i] = 0;
+  }
+  error_ = _wrapper->clEnqueueWriteBuffer(cmd_queues_[1], srcBuff_, CL_TRUE, 0,
+                                          bufSize_, inputArr_, 0, 0, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed");
+}
+
+// Times NUM_ITER buffer copies from srcBuff_ to extPhysicalBuff_ on queue 1
+// (after one warm-up copy), reads the destination back through queue 0,
+// verifies it against the input and stores the bandwidth in _perfInfo (GB/s).
+// Does nothing when open() flagged a silent failure.
+void OCLPerfSdiP2PCopy::run(void) {
+  if (silentFailure) {
+    return;
+  }
+  CPerfCounter timer;
+  // Warm up
+  error_ =
+      _wrapper->clEnqueueCopyBuffer(cmd_queues_[1], srcBuff_, extPhysicalBuff_,
+                                    0, 0, bufSize_, 0, NULL, NULL);
+
+  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Reset();
+  timer.Start();
+  for (unsigned int i = 0; i < NUM_ITER; i++) {
+    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queues_[1], srcBuff_,
+                                           extPhysicalBuff_, 0, 0, bufSize_, 0,
+                                           NULL, NULL);
+
+    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queues_[1]);
+  CHECK_RESULT(error_, "clFinish failed");
+  timer.Stop();
+  double sec = timer.GetElapsedTime();
+  error_ = _wrapper->clEnqueueReadBuffer(cmd_queues_[0], busAddressableBuff_,
+                                         CL_TRUE, 0, bufSize_, outputArr_, 0, 0,
+                                         NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueReadBuffer failed");
+  CHECK_RESULT((memcmp(inputArr_, outputArr_, bufSize_) != 0), "copy failed");
+  // Buffer copy bandwidth in GB/s
+  double perf = ((double)bufSize_ * NUM_ITER * (double)(1e-09)) / sec;
+  _perfInfo = (float)perf;
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%8d bytes) i:%4d (GB/s) %s", bufSize_, NUM_ITER,
+           deviceNames_.c_str());
+  testDescString = buf;
+}
+
+// Releases the buffers, queues and contexts created by open() and frees the
+// host arrays; release failures are reported without aborting the remaining
+// releases. Returns _crcword.
+unsigned int OCLPerfSdiP2PCopy::close(void) {
+  if (srcBuff_) {
+    error_ = _wrapper->clReleaseMemObject(srcBuff_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed");
+  }
+  if (extPhysicalBuff_) {
+    error_ = _wrapper->clReleaseMemObject(extPhysicalBuff_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed");
+  }
+  if (busAddressableBuff_) {
+    error_ = _wrapper->clReleaseMemObject(busAddressableBuff_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed");
+  }
+  if (cmd_queues_[0]) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queues_[0]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (cmd_queues_[1]) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queues_[1]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (contexts_[0]) {
+    error_ = _wrapper->clReleaseContext(contexts_[0]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  if (contexts_[1]) {
+    error_ = _wrapper->clReleaseContext(contexts_[1]);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+  // free(NULL) is a no-op, so no guards are needed.
+  free(inputArr_);
+  free(outputArr_);
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h
new file mode 100644
index 0000000000..be0ef5e7b0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSdiP2PCopy.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_SdiP2PCopy_H_
+#define _OCL_SdiP2PCopy_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that times buffer copies from one GPU into a
+// bus-addressable buffer owned by a second GPU and reports GB/s.
+class OCLPerfSdiP2PCopy : public OCLTestImp {
+ public:
+  OCLPerfSdiP2PCopy();
+  virtual ~OCLPerfSdiP2PCopy();
+  // Test life cycle: open() sets up both devices and the buffers, run() times
+  // and verifies the copies, close() releases everything and returns the CRC
+  // word.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Timed copies per run.
+  static const unsigned int NUM_ITER = 1024;
+  // Set by open() when the system cannot run the test; run() then returns
+  // without doing anything.
+  bool silentFailure;
+  // Index 0 owns the bus-addressable destination; index 1 owns the source
+  // buffer and issues the copies.
+  cl_context contexts_[2];
+  cl_device_id devices_[2];
+  cl_command_queue cmd_queues_[2];
+  // Source buffer in contexts_[1].
+  cl_mem srcBuff_;
+  // External-physical buffer in contexts_[1] created from busAddr_.
+  cl_mem extPhysicalBuff_;
+  // Bus-addressable buffer in contexts_[0].
+  cl_mem busAddressableBuff_;
+  // Status of the most recent OpenCL call.
+  cl_int error_;
+  // Bus address filled in when busAddressableBuff_ is made resident.
+  cl_bus_address_amd busAddr_;
+  // Host reference input (values 1..N) and read-back output, bufSize_ bytes
+  // each; allocated in open() and freed in close().
+  cl_uint* inputArr_;
+  cl_uint* outputArr_;
+  // Copy size in bytes for the current sub-test.
+  unsigned int bufSize_;
+  // " [<source device>-><destination device>]" suffix for the description.
+  std::string deviceNames_;
+};
+
+#endif  // _OCL_SdiP2PCopy_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp
new file mode 100644
index 0000000000..746cfbecb5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.cpp
@@ -0,0 +1,586 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfSepia.h"
+
+// NOTE(review): the four header names below were lost in extraction; they are
+// restored from the calls this file makes (printf, malloc/rand/srand, strlen)
+// and the time() remark in open() -- confirm against the upstream file.
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define WIDTH 1024
+#define HEIGHT 1024
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Arguments and result are parenthesized so compound expressions expand
+// correctly; each argument may still be evaluated twice.
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// ARB vertex program used by the GL path; the '#' lines are comments in the
+// ARB assembly, so the program only passes the texture coordinate through.
+const char *sepiaVertexProgram =
+    "!!ARBvp1.0\n"
+    "\n"
+    "\n"
+    "OPTION ARB_position_invariant;\n"
+    "\n"
+    "PARAM p0 = program.local[2];\n"
+    "PARAM p1 = program.local[3];\n"
+    "ATTRIB a0 = vertex.texcoord[0];\n"
+    "OUTPUT o0 = result.texcoord[0];\n"
+    "OUTPUT o1 = result.texcoord[1];\n"
+    "TEMP r0, r1;\n"
+    "\n"
+    "MOV o0, a0;\n"
+    "#SWZ r1, a0, x, y, 0, 0;\n"
+    "#DPH r0.x, r1, p0;\n"
+    "#DPH r0.y, r1, p1;\n"
+    "#MOV o1, r0;\n"
+    "MOV o1, a0;\n"
+    "\n"
+    "END\n";
+
+// ARB fragment program used by the GL path; with the sepia math commented out
+// it samples the rectangle texture and writes the texel unchanged.
+const char *sepiaFragmentProgram =
+    "!!ARBfp1.0\n"
+    "\n"
+    "\n"
+    "PARAM p0 = {1e-4, 0.085, 0.0, 0.0};\n"
+    "PARAM p1 = {0.2125, 0.7154, 0.0721, 0.0};\n"
+    "PARAM p2 = {-3605.984, 0.1323156, 0.0, -0.1991615};\n"
+    "PARAM p3 = {708.7939, -0.3903106, -0.05854013, 0.6621023};\n"
+    "PARAM p4 = {-50.93341, 0.4654831, 1.027555, -0.9069088};\n"
+    "PARAM p5 = {3.116672, 0.7926372, 0.03219686, 1.411847};\n"
+    "PARAM p6 = {8.95663e-4, -0.001104567, -6.0827e-4, 0.03277428};\n"
+    "PARAM p7 = program.local[0];\n"
+    "PARAM p8 = program.local[1];\n"
+    "ATTRIB a0 = fragment.texcoord[1];\n"
+    "OUTPUT o0 = result.color;\n"
+    "TEMP r0, r1, r2, r3;\n"
+    "\n"
+    "TEX r1, a0, texture[0], RECT;\n"
+    "#MAX r0, p0.x, r1.w;\n"
+    "#RCP r2, r0.x;\n"
+    "#DP3 r3, r1, p1;\n"
+    "#MUL r0, r3, r2;\n"
+    "#MAD r2, r0, p2, p3;\n"
+    "#MAD r2, r2, r0, p4;\n"
+    "#MAD r0, r2, r0, p5;\n"
+    "#MUL r2, r1.w, p6;\n"
+    "#MAD r2, r0, r3, r2;\n"
+    "#MAD r0, r1.w, p0.y, -r3;\n"
+    "#CMP r2.x, -r0, r2.x, r2.w;\n"
+    "#MAD r0, r3, r3, -r3;\n"
+    "#CMP r0, r0.x, r2, r3;\n"
+    "#MOV r0.w, r1;\n"
+    "#MUL r0, r0, p7;\n"
+    "#LRP o0, p8.x, r0, r1;\n"
+    "MOV o0, r1;\n"
+    "\n"
+    "END\n";
+
+// OpenCL kernel "program": with the sepia math commented out it reads one
+// texel from t0 and writes it to dest at the work-item's coordinates.
+// NOTE(review): r4 is read by read_imagef() while its only assignment is
+// commented out -- looks unintended; confirm before relying on the output.
+const static char *strKernel =
+    "\n"
+    "__kernel void program(write_only image2d_t dest, int flipped, int4 dim, "
+    "float2 st_origin, float4 st_delta, float4 l0, float4 l1, float4 l2, "
+    "float4 l3, read_only image2d_t t0, sampler_t t_sampler0)\n"
+    "{\n"
+    " const sampler_t sam = CLK_NORMALIZED_COORDS_FALSE | "
+    "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
+    "// const float4 p0 = (float4)( 0x1.b33334p-3, 0x1.6e48e8p-1, "
+    "0x1.275254p-4, 0x0p+0 );\n"
+    "// const float4 p1 = (float4)( 0x1.a36e2ep-14, 0x1.5c28f6p-4, 0x0p+0, "
+    "0x0p+0 );\n"
+    "// const float4 p2 = (float4)( 0x1.d595dap-11, -0x1.218e3cp-10, "
+    "-0x1.3ee89ep-11, 0x1.0c7ca6p-5 );\n"
+    "// const float4 p3 = (float4)( -0x1.c2bf7cp+11, 0x1.0efb7cp-3, "
+    "0x0p+0, -0x1.97e1fcp-3 );\n"
+    "// const float4 p4 = (float4)( 0x1.62659ep+9, -0x1.8fad94p-2, "
+    "-0x1.df8f8cp-5, 0x1.52ff12p-1 );\n"
+    "// const float4 p5 = (float4)( -0x1.9777ap+5, 0x1.dca79ap-2, "
+    "0x1.070dd8p+0, -0x1.d0565ap-1 );\n"
+    "// const float4 p6 = (float4)( 0x1.8eef1cp+1, 0x1.95d48cp-1, "
+    "0x1.07c1b6p-5, 0x1.696ecep+0 );\n"
+    "// int dest_width = dim.x;\n"
+    "// int dest_height = dim.y;\n"
+    " float4 o0, r0, r1, r2, r3, r4;\n"
+    "// float4 false_vector = (float4) 0.0f;\n"
+    "// float4 true_vector = (float4) 1.0f;\n"
+    " int2 loc = (int2)( get_global_id(0), get_global_id(1) );\n"
+    "// if ((loc.x >= dim.x) || loc.y >= dim.y) return;\n"
+    "// float4 f0 = (float4)( st_origin.x + ((float)loc.x + 0.5f) * "
+    "st_delta.x + ((float)loc.y + 0.5f) * st_delta.z, st_origin.y + "
+    "((float)loc.x + 0.5f) * st_delta.y + ((float)loc.y + 0.5f) * st_delta.w, "
+    "0.0f, 0.0f );\n"
+    "// r2 = f0;\n"
+    "// r0.x = dot(r2.xy,l2.xy) + l2.w;\n"
+    "// r0.y = dot(r2.xy,l3.xy) + l3.w;\n"
+    "// r4 = r0;\n"
+    " r1 = read_imagef(t0, sam/*t_sampler0*/, r4.xy);\n"
+    "// r3 = dot(r1.xyz,p0.xyz);\n"
+    "// r2 = max(p1.xxxx, r1.wwww);\n"
+    "// r0 = native_recip(r2.xxxx);\n"
+    "// r4 = r3*r0;\n"
+    "// r2 = r1.wwww*p2;\n"
+    "// r0 = mad(r4,p3,p4);\n"
+    "// r0 = mad(r0,r4,p5);\n"
+    "// r0 = mad(r0,r4,p6);\n"
+    "// r2 = mad(r0,r3,r2);\n"
+    "// r0 = mad(r1.wwww,p1.yyyy,-r3);\n"
+    "// r2.x = select(r2.w,r2.x, isless(-r0.x, 0.0f));\n"
+    "// r0 = mad(r3,r3,-r3);\n"
+    "// r0 = select(r3,r2, isless(r0.xxxx, 0.0f));\n"
+    "// r0.w = r1.w;\n"
+    "// r0 = r0*l0;\n"
+    "// r0 = mix(r1,r0, l1.xxxx);\n"
+    "// r0.xyz = min(r0.xyz, r0.www);\n"
+    "// o0 = r0;\n"
+    " write_imagef(dest, loc /*(int2)( loc.x + dim.z , flipped ? "
+    "get_image_height(dest) - (loc.y + dim.w + 1) : loc.y + dim.w )*/, r1 "
+    "/*o0*/);\n"
+    "}\n";
+
+OCLPerfSepia::OCLPerfSepia() { _numSubTests = 2; }
+
+OCLPerfSepia::~OCLPerfSepia() {}
+
+// Prepares sub-test `test`: resets the test state, seeds rand() with a fixed
+// value, and opens the shared GL/CL context via OCLGLCommon::open(). For
+// sub-test 0 it also builds strKernel and creates the "program" kernel. Sets
+// silentFailure_ and returns early when IsGLEnabled() reports false.
+void OCLPerfSepia::open(unsigned int test, char *units, double &conversion,
+                        unsigned int deviceId) {
+  bVerify_ = false;
+  silentFailure_ = false;
+  iterations_ = 50000;
+  bpr_ = 0;
+  data_ = 0;
+  result_ = 0;
+  width_ = 0;
+  height_ = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+  texId = 0;
+  format_.image_channel_order = CL_RGBA;
+  format_.image_channel_data_type = CL_UNORM_INT8;
+
+  srand(0x8956);  // some constant instead of time() so that we get same random
+                  // numbers
+
+  if (!IsGLEnabled(test, units, conversion, deviceId)) {
+    silentFailure_ = true;
+    return;
+  }
+  OCLGLCommon::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+  if (test == 0) {
+    // Build the kernel
+    program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel,
+                                                   NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS),
+                 "clCreateProgramWithSource() failed (%d)", error_);
+    const char *optionsGPU = "-cl-denorms-are-zero -cl-mad-enable";
+    error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
+                                      optionsGPU, NULL, NULL);
+    if (error_ != CL_SUCCESS) {
+      char programLog[1024];
+      // Keep the printf below well-defined even if the log query fails (for
+      // example when the log does not fit in the buffer).
+      programLog[0] = '\0';
+      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                      CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                      programLog, 0);
+      printf("\n%s\n", programLog);
+      fflush(stdout);
+    }
+    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
+                 error_);
+
+    kernel_ = _wrapper->clCreateKernel(program_, "program", &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
+                 error_);
+  }
+}
+
+// Allocates a WIDTH x HEIGHT image with 4 bytes per pixel in data_ and fills
+// it: every fourth byte (offset 0 of each pixel) is 0xFF, the other bytes are
+// rand() % 256.
+void OCLPerfSepia::populateData(void) {
+  width_ = WIDTH;
+  height_ = HEIGHT;
+  bpr_ = 4 * width_;
+  data_ = (cl_uchar *)malloc(height_ * bpr_);
+  CHECK_RESULT(data_ == 0, "malloc failed");
+  for (unsigned int n = 0; n < (height_ * bpr_); n++) {
+    data_[n] = (n & 3) ? (rand() % 256) : 0xFF;
+  }
+}
+
+void OCLPerfSepia::runGL(void) {
+  glDisable(GL_ALPHA_TEST);
+  glDisable(GL_DEPTH_TEST);
+  glDisable(GL_SCISSOR_TEST);
+  glDisable(GL_BLEND);
+  glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+  glDisable(GL_DITHER);
+  glDisable(GL_CULL_FACE);
+  glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+  glDepthMask(GL_FALSE);
+  glStencilMask(0);
+
+  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+
+  // let's create the textures we need
+
+  glEnable(GL_TEXTURE_RECTANGLE_EXT);
+  glGenTextures(1, &texId);
+  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);
+
+  // have GL alloc memory for us for our destination texture which we will be
+  // rendering into
+  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
+               GL_BGRA /*RGBA*/, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+  // for the source texture we will provide a data ptr and hang on to it
+  GLuint srcTexture;
+
+  glGenTextures(1, &srcTexture);
+  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, srcTexture);
+
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, width_);
+  glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, height_);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 8);
+
+  // XXX Alex -- use optimal texture upload format.
+ glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0, + GL_BGRA, /* GL_RGBA,*/ + format_.image_channel_order == CL_RGBA + ? GL_UNSIGNED_INT_8_8_8_8 + : GL_UNSIGNED_INT_8_8_8_8_REV, + data_); + + glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, + GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, + GL_CLAMP_TO_EDGE); + glPixelStorei(GL_UNPACK_SWAP_BYTES, 0); + glPixelStorei(GL_UNPACK_LSB_FIRST, 0); + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0); + glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0); + glPixelStorei(GL_UNPACK_SKIP_IMAGES, 0); + glPixelStorei(GL_UNPACK_SKIP_ROWS, 0); + glPixelStorei(GL_UNPACK_ALIGNMENT, 4); + + GLuint vertexProgram; + GLuint fragmentProgram; + + glGenProgramsARB(1, &vertexProgram); + glGenProgramsARB(1, &fragmentProgram); + + glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexProgram); + glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, + (GLsizei)strlen(sepiaVertexProgram), sepiaVertexProgram); + + glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fragmentProgram); + glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, + (GLsizei)strlen(sepiaFragmentProgram), + sepiaFragmentProgram); + + GLfloat l0[] = {1.0f, 0.99f, 0.92f, 1.0f}; + GLfloat l1[] = {0.5, 0, 0, 0}; + GLfloat l2[] = {1, 0, 0, 0}; + GLfloat l3[] = {0, -1, 0, (GLfloat)height_}; + + glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 0, l0); + glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 1, l1); + glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 2, l2); + glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 3, l3); + + glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, l0); + glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 1, l1); + 
glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 2, l2); + glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 3, l3); + + GLuint fbo; + + glGenFramebuffersEXT(1, &fbo); + + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo); + + glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, + GL_TEXTURE_RECTANGLE_ARB, texId, 0); + glViewport(0, 0, width_, height_); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0, width_, 0, height_, -1, 1); + glClearColor(0, 0, 0, 0); + glClear(GL_COLOR_BUFFER_BIT); + glDisable(GL_BLEND); + + glEnable(GL_VERTEX_PROGRAM_ARB); + glEnable(GL_FRAGMENT_PROGRAM_ARB); + + // warm up + for (unsigned int k = 0; k < (iterations_ / 10); k++) { + glBegin(GL_QUADS); + glTexCoord2f(0, 0); + glVertex2f(0, (GLfloat)height_); + glTexCoord2f((GLfloat)width_, 0); + glVertex2f((GLfloat)width_, (GLfloat)height_); + glTexCoord2f((GLfloat)width_, (GLfloat)height_); + glVertex2f((GLfloat)width_, 0); + glTexCoord2f(0, (GLfloat)height_); + glVertex2f(0, 0); + glEnd(); + glFlush(); + glFinish(); + } + + // actual test + for (unsigned int k = 0; k < iterations_; k++) { + if (k == 1) { + timer_.Reset(); + timer_.Start(); + } + + glBegin(GL_QUADS); + glTexCoord2f(0, 0); + glVertex2f(0, (GLfloat)height_); + glTexCoord2f((GLfloat)width_, 0); + glVertex2f((GLfloat)width_, (GLfloat)height_); + glTexCoord2f((GLfloat)width_, (GLfloat)height_); + glVertex2f((GLfloat)width_, 0); + glTexCoord2f(0, (GLfloat)height_); + glVertex2f(0, 0); + glEnd(); + } + + glFlush(); + glFinish(); + + timer_.Stop(); + + glDisable(GL_VERTEX_PROGRAM_ARB); + glDisable(GL_FRAGMENT_PROGRAM_ARB); + + // now let's read back the pixels + result_ = (cl_uchar *)malloc(width_ * height_ * 4); + + glReadPixels(0, 0, width_, height_, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, + result_); + + // bind back default frame buffer + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); + + glDeleteFramebuffersEXT(1, &fbo); + glDeleteTextures(1, &srcTexture); + glDeleteProgramsARB(1, 
&vertexProgram); + glDeleteProgramsARB(1, &fragmentProgram); +} + +void OCLPerfSepia::runCL(void) { + cl_mem dst, src; + cl_sampler nearestZero; + + glEnable(GL_TEXTURE_RECTANGLE_EXT); + glGenTextures(1, &texId); + glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId); + // XXX Alex: have GL alloc memory for us ... + glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0, + GL_RGBA /*BGRA*/, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + + dst = _wrapper->clCreateFromGLTexture2D( + context_, CL_MEM_READ_WRITE, GL_TEXTURE_RECTANGLE_EXT, 0, texId, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateFromGLTexture2D error (%d)", + error_); + nearestZero = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_CLAMP, + CL_FILTER_NEAREST, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler error (%d)", error_); + src = _wrapper->clCreateImage2D( + context_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format_, width_, + height_, bpr_, data_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage2D error (%d)", error_); + + int numArgs = 0; + int dim[2] = {(int)width_, (int)height_}; + int flipped[] = {1}; + int dims[] = {(int)width_, (int)height_, 0, 0}; + float st_origin[] = {0, 0}; + float st_delta[] = {1, 0, 0, 1}; + + _wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_mem), + &dst); // arg is a image2DGL named "dst" + _wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(int), + &flipped); // arg is a int1 named "flipped" + _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(int), + &dims); // arg is a int4 named "dim" + _wrapper->clSetKernelArg(kernel_, numArgs++, 2 * sizeof(float), + &st_origin); // arg is a float2 named "st_origin" + _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float), + &st_delta); // arg is a float4 named "st_delta" + + float l0[] = {1.0f, 0.99f, 0.92f, 1.0f}; + float l1[] = {0.5f, 0.0f, 0.0f, 0.0f}; + float l2[] = {1.0f, 0.0f, 0.0f, 0.0f}; + float l3[] = {0.0f, -1.0f, 0.0f, (float)height_}; + + 
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float), + &l0); // arg is a float4 named "l0" + _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float), + &l1); // arg is a float4 named "l1" + _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float), + &l2); // arg is a float4 named "l2" + _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float), + &l3); // arg is a float4 named "l3" + _wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_mem), + &src); // arg is a image2D named "t0" + _wrapper->clSetKernelArg( + kernel_, numArgs++, sizeof(cl_sampler), + &nearestZero); // arg is a sampler named "t_sampler0" + + size_t execution_threads[2]; + size_t execution_local[2]; + cl_uint work_dim = 2; + error_ = _wrapper->clGetKernelWorkGroupInfo( + kernel_, devices_[_deviceId], CL_KERNEL_WORK_GROUP_SIZE, + sizeof(execution_local[0]), &execution_local[0], 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelWorkGroupInfo error (%d)", + error_); + execution_local[1] = 1; + work_dim = 2; + GetKernelExecDimsForImage((unsigned int)execution_local[0], dim[0], dim[1], + execution_threads, execution_local); + result_ = (cl_uchar *)malloc(height_ * bpr_); + + const size_t origin[] = {0, 0, 0}; + const size_t region[] = {width_, height_, 1}; + + // warm up + for (unsigned int k = 0; k < (iterations_ / 10); k++) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, + work_dim, NULL, execution_threads, + execution_local, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)", + error_); + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_); + } + + // actual test + for (unsigned int k = 0; k < iterations_; k++) { + if (k == 1) { + timer_.Reset(); + timer_.Start(); + } + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, + work_dim, NULL, execution_threads, + execution_local, 0, NULL, NULL); + 
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)", + error_); + } + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_); + + timer_.Stop(); + + error_ = + _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], dst, true, origin, + region, bpr_, 0, result_, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage error (%d)", error_); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + _wrapper->clReleaseMemObject(src), src = NULL; + _wrapper->clReleaseSampler(nearestZero); + _wrapper->clReleaseMemObject(dst), dst = NULL; +} + +void OCLPerfSepia::GetKernelExecDimsForImage(unsigned int work_group_size, + unsigned int w, unsigned int h, + size_t *global, size_t *local) { + unsigned int a, b; + static const unsigned int tile_size = 16; + + // local[0] and local[1] must be at least 1 + local[0] = tile_size < work_group_size ? tile_size : work_group_size; + local[1] = work_group_size / tile_size > tile_size + ? tile_size + : MAX(work_group_size / tile_size, 1); + + a = w; + b = (unsigned int)local[0]; + + global[0] = ((a % b) != 0) ? (a / b + 1) : (a / b); + global[0] *= local[0]; + + a = h; + b = (unsigned int)local[1]; + + global[1] = ((a % b) != 0) ? (a / b + 1) : (a / b); + global[1] *= local[1]; +} + +void OCLPerfSepia::run(void) { + if (_errorFlag || silentFailure_) { + return; + } + populateData(); + if (_openTest == 0) { + runCL(); + } else { + runGL(); + } + if (bVerify_) { + verifyResult(); + } + char buf[100]; + SNPRINTF(buf, sizeof(buf), "%s iterations# %d", + (_openTest == 0) ? 
"CL" : "GL", iterations_); + testDescString = buf; + _perfInfo = (float)timer_.GetElapsedTime(); +} + +void OCLPerfSepia::verifyResult(void) { + int r = 0, g = 0, b = 0, a = 0, d = 0; + for (unsigned int k = 0; k < height_ * bpr_; k += 4) { + a = a + result_[k + 0]; + r = r + result_[k + 1]; + g = g + result_[k + 2]; + b = b + result_[k + 3]; + } + d = abs(r - 152797810) + abs(g - 125868080) + abs(b - 76147833) + + abs(a - 267386880); + CHECK_RESULT(d > 20000, "wrong result"); +} +unsigned int OCLPerfSepia::close(void) { + if (silentFailure_) { + return 0; + } + + if (data_) { + free(data_); + } + + if (result_) { + free(result_); + } + + if (texId) { + glDeleteTextures(1, &texId); + } + + return OCLGLCommon::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h new file mode 100644 index 0000000000..0103060009 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfSepia.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
 * Performance test with two sub-tests (see the constructor): sub-test 0 is
 * run through OpenCL (runCL), any other sub-test through OpenGL (runGL).
 * The elapsed time of the timed loop is reported by run().
 */
class OCLPerfSepia : public OCLGLCommon {
 public:
  OCLPerfSepia();
  virtual ~OCLPerfSepia();

  /* Resets state; for sub-test 0 also builds the OpenCL kernel. */
  virtual void open(unsigned int test, char *units, double &conversion,
                    unsigned int deviceId);
  /* Generates input data, runs the CL or GL path and records the timing. */
  virtual void run(void);
  /* Frees data_/result_, deletes texId and calls OCLGLCommon::close(). */
  virtual unsigned int close(void);

 private:
  /* OpenGL path: renders with ARB vertex/fragment programs into an FBO. */
  void runGL(void);
  /* OpenCL path: dispatches the kernel writing into a GL-shared texture. */
  void runCL(void);
  /* Sets width_/height_/bpr_ and allocates and fills data_. */
  void populateData(void);
  /* Sums the four byte channels of result_ and compares against constants. */
  void verifyResult(void);
  /* Picks local/global NDRange sizes for an image of w x h pixels. */
  void GetKernelExecDimsForImage(unsigned int work_group_size, unsigned int w,
                                 unsigned int h, size_t *global, size_t *local);

  bool silentFailure_;     /* set in open() when IsGLEnabled() is false;
                              run() and close() then return early */
  cl_uint iterations_;     /* timed iterations (open() sets 50000); the warm-up
                              loop runs iterations_ / 10 */
  cl_image_format format_; /* CL_RGBA / CL_UNORM_INT8, set in open() */
  cl_uchar *data_;         /* host source pixels, malloc'd in populateData() */
  cl_uchar *result_;       /* read-back pixels, malloc'd in runGL()/runCL() */
  bool bVerify_;           /* open() sets false; when true run() verifies */
  cl_uint width_;          /* image width, set from WIDTH */
  cl_uint height_;         /* image height, set from HEIGHT */
  cl_uint bpr_;            /* bytes per row, 4 * width_ */
  GLuint texId;            /* destination GL texture, deleted in close() */
  CPerfCounter timer_;     /* started at iteration 1 of the timed loop */
};
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfTextureMemLatency.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const unsigned int NUM_SIZES = 13; +// 2k up to 64MB +static const cl_uint2 Dims[NUM_SIZES] = { + {{32, 16}}, {{32, 32}}, {{64, 32}}, {{64, 64}}, {{128, 64}}, + {{128, 128}}, {{256, 128}}, {{256, 256}}, {{512, 256}}, {{512, 512}}, + {{1024, 512}}, {{1024, 1024}}, {{2048, 1024}}}; +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfTextureMemLatency::genShader() { + shader_.clear(); + + // Adopted from SiSoft Sandra 2013's memory latency test + shader_ += + "constant sampler_t insample = CLK_NORMALIZED_COORDS_FALSE | " + "CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" + "__kernel\n" + "__attribute__((work_group_size_hint(1, 1, 1)))\n" + "void MemWalker(\n" + " read_only image2d_t input,\n" + " __global uint * restrict output,\n" + " const uint uCount, const uint uSize,\n" + " const uint4 uOffset, const int bMem, const uint repeats)\n" + "{\n" + " uint4 o = uOffset;\n" + " uint lid = get_local_id(0);\n" + " uint4 x = lid*o;\n" + "\n" + " for (uint loop = 0; (loop < repeats); loop++) {\n" + " uint i = uCount;\n" + " int2 nx = (int2)(0,0);\n" + " nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n" + " while (i--) {\n" + " x = read_imageui(input, insample, nx);\n" + " x.x += o.x;\n" + " x.z += o.z;\n" + " nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n" + " }\n" + " }\n" + "\n" + " output[0] = x.x + x.y;\n" + "}\n"; + + // printf("shader:\n%s\n", shader_.c_str()); + shader_ += "\n\n"; + shader_ += + "__kernel\n" + "__attribute__((work_group_size_hint(1, 1, 1)))\n" + "void Overhead(\n" + " read_only image2d_t input,\n" + " __global uint * restrict output,\n" + " const uint uCount, const uint uSize,\n" + " const uint4 uOffset, const int bMem, const uint repeats)\n" + "{\n" + " uint4 o = uOffset;\n" + " uint lid = get_local_id(0);\n" + " uint4 x = lid*o;\n" + " x += 
o;\n" + " int2 nx;\n" + " for (uint loop = 0; loop < repeats; loop++) {\n" + " uint i = uCount;\n" + " nx = (int2)(0,0);\n" + " nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n" + " while (i--) {\n" + " x.x = nx.x + o.x;\n" + " x.z = nx.y + o.y;\n" + " nx = (int2)((x.y << 8) | x.x, (x.w << 8) | x.z);\n" + " }\n" + " }\n" + " output[0] = nx.x | nx.y;\n" + "}\n"; +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfTextureMemLatency::OCLPerfTextureMemLatency() { + _numSubTests = NUM_SIZES; + maxSize_ = Dims[NUM_SIZES - 1].s[0] * Dims[NUM_SIZES - 1].s[1]; +} + +OCLPerfTextureMemLatency::~OCLPerfTextureMemLatency() {} + +void OCLPerfTextureMemLatency::setData(cl_mem buffer, unsigned int val) { + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width_, height_, 1}; + + void *ptr = _wrapper->clEnqueueMapImage( + cmd_queue_, buffer, true, CL_MAP_WRITE, origin, region, &image_row_pitch, + &image_slice_pitch, 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapImage failed."); + unsigned int *data = (unsigned int *)ptr; + unsigned int nextOffset = 0; + for (unsigned int i = 0; i < bufSizeDW_; i++) { + unsigned int offset = ((1024 + 17) * (i + 1)) % bufSizeDW_; + unsigned int x, y; + x = offset % width_; + y = offset / width_; + unsigned int newx, newy; + newx = nextOffset % width_; + newy = nextOffset / width_; + data[newy * image_row_pitch / sizeof(unsigned int) + newx] = + (y << 16) | (x & 0xffff); + nextOffset = offset; + } + error_ = + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); + clFinish(cmd_queue_); +} + +void OCLPerfTextureMemLatency::checkData(cl_mem buffer) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0, + sizeof(cl_uint), 0, NULL, NULL, &error_); + + unsigned int *data = (unsigned int *)ptr; + if (data[0] != 0) { + printf("OutData= 0x%08x\n", data[0]); + CHECK_RESULT_NO_RETURN(data[0] != 0, 
"Data validation failed!\n"); + } + error_ = + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); + clFinish(cmd_queue_); +} + +void OCLPerfTextureMemLatency::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + _errorFlag = false; // Reset error code so a single error doesn't prevent + // other subtests from running + _errorMsg = ""; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + } + } + + delete platforms; + } + + width_ = Dims[test % NUM_SIZES].s[0]; + height_ = Dims[test % NUM_SIZES].s[1]; + + bufSizeDW_ = width_ * height_; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + free(devices); + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8}; + inBuffer_ = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY, &format, + width_, height_, 0, NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateImage(inBuffer) failed"); + + outBuffer_ = + _wrapper->clCreateBuffer(context_, 0, sizeof(cl_uint), NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + genShader(); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "MemWalker", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel(MemWalker) failed"); + + kernel2_ = _wrapper->clCreateKernel(program_, "Overhead", &error_); + CHECK_RESULT(kernel_ == 0, 
"clCreateKernel(Overhead) failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), + (void *)&bufSizeDW_); + error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), + (void *)&bufSizeDW_); + cl_uint4 zero; + zero.s[0] = 0; + zero.s[1] = 0; + zero.s[2] = 0; + zero.s[3] = 0; + error_ = + _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint4), (void *)&zero); + int bMem = 1; + error_ = _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_int), (void *)&bMem); + repeats_ = std::max((maxSize_ >> 2) / bufSizeDW_, 1u); + error_ = + _wrapper->clSetKernelArg(kernel_, 6, sizeof(cl_uint), (void *)&repeats_); + + error_ = + _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = _wrapper->clSetKernelArg(kernel2_, 1, sizeof(cl_mem), + (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel2_, 2, sizeof(cl_uint), + (void *)&bufSizeDW_); + error_ = _wrapper->clSetKernelArg(kernel2_, 3, sizeof(cl_uint), + (void *)&bufSizeDW_); + error_ = + _wrapper->clSetKernelArg(kernel2_, 4, sizeof(cl_uint4), (void *)&zero); + error_ = _wrapper->clSetKernelArg(kernel2_, 5, sizeof(cl_int), (void *)&bMem); + error_ = + _wrapper->clSetKernelArg(kernel2_, 6, sizeof(cl_uint), (void *)&repeats_); + + setData(inBuffer_, (int)1.0f); +} + +void OCLPerfTextureMemLatency::run(void) { + int global = 1; + int local = 1; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + // Warm-up + unsigned int warmup = 128; + error_ = + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), (void *)&warmup); + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + error_ = 
_wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), + (void *)&bufSizeDW_); + _wrapper->clFinish(cmd_queue_); + + CPerfCounter timer, timer2; + + timer.Reset(); + timer.Start(); + + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + + checkData(outBuffer_); + + timer2.Reset(); + timer2.Start(); + + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel2_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + + _wrapper->clFinish(cmd_queue_); + + timer2.Stop(); + double sec = timer.GetElapsedTime() - timer2.GetElapsedTime(); + + // Read latency in ns + double perf = sec * (double)(1e09) / ((double)bufSizeDW_ * (double)repeats_); + + _perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), "%8d reads, %5d repeats (ns)", bufSizeDW_, + repeats_); + testDescString = buf; +} + +unsigned int OCLPerfTextureMemLatency::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (kernel2_) { + error_ = _wrapper->clReleaseKernel(kernel2_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if 
(cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h new file mode 100644 index 0000000000..31a1197286 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfTextureMemLatency.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
/*
 * Texture memory latency test: a single work-item chases coordinates stored
 * in a 2-D image ("MemWalker" kernel) and the time of an address-arithmetic
 * only kernel ("Overhead") is subtracted to report ns per read.
 */
class OCLPerfTextureMemLatency : public OCLTestImp {
 public:
  OCLPerfTextureMemLatency();
  virtual ~OCLPerfTextureMemLatency();

 public:
  /* Creates context, queue, image, output buffer and both kernels. */
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  /* Warm-up, timed MemWalker run, timed Overhead run, then reports latency. */
  virtual void run(void);
  /* Releases all CL objects and returns _crcword. */
  virtual unsigned int close(void);

  std::string shader_; /* OpenCL C source assembled by genShader() */
  /* Rebuilds shader_ with the MemWalker and Overhead kernels. */
  void genShader(void);
  /* Writes the pointer-chasing pattern into the image; `data` is unused. */
  void setData(cl_mem buffer, unsigned int data);
  /* Reads the first cl_uint of the buffer and flags non-zero as a failure. */
  void checkData(cl_mem buffer);

  cl_context context_;
  cl_command_queue cmd_queue_;
  cl_program program_;
  cl_kernel kernel_;  /* "MemWalker" */
  cl_kernel kernel2_; /* "Overhead" */
  cl_mem inBuffer_;   /* CL_RGBA / CL_UNSIGNED_INT8 image, width_ x height_ */
  cl_mem outBuffer_;  /* single cl_uint result buffer */
  cl_int error_;      /* status of the most recent CL call */

  unsigned int width_;      /* image width, from Dims[test % NUM_SIZES] */
  unsigned int height_;     /* image height, from Dims[test % NUM_SIZES] */
  size_t image_row_pitch;   /* filled in by clEnqueueMapImage in setData() */
  size_t image_slice_pitch; /* filled in by clEnqueueMapImage in setData() */
  unsigned int bufSizeDW_;  /* width_ * height_ */
  unsigned int repeats_;    /* max((maxSize_ >> 2) / bufSizeDW_, 1) */
  unsigned int maxSize_;    /* texel count of the largest Dims entry */
};
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfUAVReadSpeed.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +static const unsigned int NUM_SIZES = 4; +static const unsigned int NUM_READ_MODES = 6; +// Limit to 32 reads for now +static const unsigned int MAX_READ_MODES = 4; + +static const unsigned int NumReads[NUM_READ_MODES] = {1, 4, 16, 32, 64, 128}; +// 256KB, 1 MB, 4MB, 16 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; +static const unsigned int MaxTypes = 6; +static unsigned int NumTypes = MaxTypes; +static const char *types[MaxTypes] = {"char", "short", "int", + "long", "float", "double"}; +static unsigned int StartType = 0; +static const unsigned int NumVecWidths = 5; +static const char *vecWidths[NumVecWidths] = {"", "2", "4", "8", "16"}; +static const unsigned int TypeSize[MaxTypes] = { + sizeof(cl_char), sizeof(cl_short), sizeof(cl_int), + sizeof(cl_long), sizeof(cl_float), sizeof(cl_double)}; +#define CHAR_BUF_SIZE 512 + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfUAVReadSpeed::genShader(unsigned int type, unsigned int vecWidth, + unsigned int numReads) { + char buf[CHAR_BUF_SIZE]; + + shader_.clear(); + shader_ += + "#ifdef USE_ARENA\n" + "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_AMD_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n"; + shader_ += + "#ifdef 
USE_KHR_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, + "__kernel void __attribute__((reqd_work_group_size(64,1,1))) " + "_uavReadSpeed(__global %s%s * restrict inBuf, __global %s%s * " + "restrict outBuf, constant uint * restrict constBuf)\n", + types[type], vecWidths[vecWidth], types[type], vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + "{\n" + " uint i = (uint) get_global_id(0);\n"; + if (numReads == 1) { + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + " const unsigned int Max = constBuf[0];\n" + " temp = *(inBuf + i % Max);\n"; + shader_ += + " *(outBuf + i) = temp;\n" + "}\n"; + } else { + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp0 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp1 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp2 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp3 = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + " const unsigned int Max = constBuf[0];\n" + " unsigned int idx0 = (i % Max) + constBuf[1];\n" + " unsigned int idx1 = (i % Max) + constBuf[2];\n" + " unsigned int idx2 = (i % Max) + constBuf[3];\n" + " unsigned int idx3 = (i % Max) + constBuf[4];\n"; + + for (unsigned int i = 0; i < (numReads >> 2); i++) { + shader_ += " temp0 += *(inBuf + idx0);\n"; + shader_ += " temp1 += *(inBuf + idx1);\n"; + shader_ += " temp2 += *(inBuf + idx2);\n"; + shader_ += " temp3 += *(inBuf + idx3);\n"; + shader_ += " idx0 += constBuf[5];\n"; + shader_ += " idx1 += constBuf[5];\n"; + shader_ += " idx2 += constBuf[5];\n"; + shader_ += " idx3 += constBuf[5];\n"; + } + shader_ += + " *(outBuf + i) = temp0 + temp1 + temp2 + temp3;\n" + "}\n"; + } + // printf("shader:\n%s\n", 
shader_.c_str()); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfUAVReadSpeed::OCLPerfUAVReadSpeed() { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + context_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of + // just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + char *p = strstr(charbuf, "cl_khr_byte_addressable_store"); + char *p2 = strstr(charbuf, "cl_khr_fp64"); + char *p3 = strstr(charbuf, "cl_amd_fp64"); + + NumTypes = MaxTypes; + if (!p) { + // No arena ops + NumTypes -= 2; + StartType = 2; + } + if (!p2 && !p3) { + // Doubles not supported + NumTypes--; + } + _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES * 2; + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } +} + +OCLPerfUAVReadSpeed::~OCLPerfUAVReadSpeed() {} + +// Fill with 1s of appropriate type +void OCLPerfUAVReadSpeed::setData(cl_mem buffer, float val) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_WRITE, 0, + bufSize_, 0, NULL, NULL, &error_); + switch (typeIdx_) { + case 0: // char + { + char *data = (char *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(char)); i++) + data[i] = (char)val; + break; + } + case 1: // short + { + short *data = (short *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(short)); i++) + data[i] = (short)val; + break; + } + case 2: // int 
+ { + int *data = (int *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(int)); i++) + data[i] = (int)val; + break; + } + case 3: // long + { + cl_long *data = (cl_long *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_long)); i++) + data[i] = (cl_long)val; + break; + } + case 4: // float + { + float *data = (float *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(float)); i++) + data[i] = val; + break; + } + case 5: // double + { + double *data = (double *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(double)); i++) + data[i] = (double)val; + break; + } + default: + // oops + break; + } + error_ = + _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); +} + +void OCLPerfUAVReadSpeed::checkData(cl_mem buffer) { + void *ptr = + _wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, CL_MAP_READ, 0, + bufSize_, 0, NULL, NULL, &error_); + switch (typeIdx_) { + case 0: // char + { + char *data = (char *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(char)); i++) { + if (data[i] != (char)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 1: // short + { + short *data = (short *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(short)); i++) { + if (data[i] != (short)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 2: // int + { + int *data = (int *)ptr; + for (unsigned int i = 0; 
i < (bufSize_ / sizeof(int)); i++) { + if (data[i] != (int)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 3: // long + { + cl_long *data = (cl_long *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_long)); i++) { + if (data[i] != (cl_long)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 4: // float + { + float *data = (float *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(float)); i++) { + if (data[i] != (float)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + case 5: // double + { + double *data = (double *)ptr; + for (unsigned int i = 0; i < (bufSize_ / sizeof(double)); i++) { + if (data[i] != (double)numReads_) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, + numReads_, numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + break; + } + default: + // oops + break; + } + error_ = + 
_wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, ptr, 0, NULL, NULL); +} + +void OCLPerfUAVReadSpeed::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + constBuffer_ = 0; + isAMD = false; + _errorFlag = false; // Reset error code so a single error doesn't prevent + // other subtests from running + _errorMsg = ""; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + + numReads_ = NumReads[test % MAX_READ_MODES]; + width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES]; + 
vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths; + typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes + + StartType; + cached_ = (test >= (MAX_READ_MODES * NUM_SIZES * NumTypes * NumVecWidths)); + + bufSize_ = width_; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_); + CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed"); + + genShader(typeIdx_, vecSizeIdx_, numReads_); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + if (cached_ && isAMD) { + args = "-fno-alias "; + } + if (typeIdx_ < 2) { + args += "-D USE_ARENA "; + } + + if (typeIdx_ == 5) { + if (isAMD) { + args += "-D USE_AMD_DOUBLES 
"; + } else { + args += "-D USE_KHR_DOUBLES "; + } + } +#if 0 + // This setting can dramatically boost the long16 perf results by avoiding spilling. + if (isAMD) + args += "-Wb,-pre-RA-sched=list-tdrr"; +#endif + + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "_uavReadSpeed", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), + (void *)&constBuffer_); + + setData(inBuffer_, 1.0f); + setData(outBuffer_, 1.2345678f); + unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL, + &error_); + // Force all wavefronts to fetch the same data. We are looking for peak speed + // here. + cBuf[0] = 64; + // These values are chosen to assure there is no data reuse within a clause. + // If caching is not working, then the uncached numbers will be low. 
+ cBuf[1] = 0; + cBuf[2] = 64; + cBuf[3] = 128; + cBuf[4] = 192; + cBuf[5] = 0; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0, + NULL, NULL); + _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfUAVReadSpeed::run(void) { + int global = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Constant bandwidth in GB/s + double perf = + ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + char buf2[256]; + SNPRINTF(buf, sizeof(buf), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]); + SNPRINTF(buf2, sizeof(buf2), " %-8s (%8d) %2d reads: %-8s (GB/s) ", buf, + width_, numReads_, (cached_ ? 
"cached" : "uncached")); + testDescString = buf2; + checkData(outBuffer_); +} + +unsigned int OCLPerfUAVReadSpeed::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (constBuffer_) { + error_ = _wrapper->clReleaseMemObject(constBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(constBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h new file mode 100644 index 0000000000..b779e7d6e8 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeed.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
#ifndef _OCL_UAVReadSpeed_H_
#define _OCL_UAVReadSpeed_H_

#include "OCLTestImp.h"

// Performance test that times a generated OpenCL kernel reading from a global
// buffer and reports the achieved rate in GB/s. Sub-tests vary the element
// type, vector width, buffer size and number of reads per work-item.
class OCLPerfUAVReadSpeed : public OCLTestImp {
 public:
  // Probes the device extensions to decide which element types are usable and
  // sets the number of sub-tests accordingly.
  OCLPerfUAVReadSpeed();
  virtual ~OCLPerfUAVReadSpeed();

 public:
  // Decodes sub-test index `test`, creates the OpenCL objects, builds the
  // kernel and initialises the buffers. `units` is unused; `conversion` is
  // set to 1.0.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  // Launches the kernel NUM_ITER times, records the bandwidth and validates
  // the output buffer.
  virtual void run(void);
  // Releases everything open() created and returns _crcword.
  virtual unsigned int close(void);

  // OpenCL C source produced by genShader().
  std::string shader_;
  // Builds shader_ for the given type index, vector-width index and number of
  // reads per work-item.
  void genShader(unsigned int type, unsigned int vecWidth,
                 unsigned int numReads);
  // Fills `buffer` with `data` converted to the current element type.
  void setData(cl_mem buffer, float data);
  // Verifies that every element of `buffer` equals numReads_.
  void checkData(cl_mem buffer);

  // Kernel launches timed per run().
  static const unsigned int NUM_ITER = 100;

  // OpenCL objects owned between open() and close().
  cl_context context_;
  cl_command_queue cmd_queue_;
  cl_program program_;
  cl_kernel kernel_;
  cl_mem inBuffer_;
  cl_mem outBuffer_;
  cl_mem constBuffer_;  // index parameters read by the kernel
  cl_int error_;        // status of the most recent OpenCL call

  unsigned int width_;       // buffer size in bytes chosen for the sub-test
  unsigned int bufSize_;     // bytes in inBuffer_ / outBuffer_ (== width_)
  unsigned int vecSizeIdx_;  // index into the vector-width table
  unsigned int numReads_;    // reads per work-item
  unsigned int typeIdx_;     // index into the element-type table
  bool cached_;              // true for the second half of the sub-tests
  bool isAMD;                // platform vendor is "Advanced Micro Devices, Inc."
};

#endif  // _OCL_UAVReadSpeed_H_
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfUAVReadSpeedHostMem.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +const unsigned int NUM_SIZES = 4; +const unsigned int NUM_READ_MODES = 1; +const unsigned int MAX_READ_MODES = 1; + +static const unsigned int NumReads[NUM_READ_MODES] = {1}; +// 256KB, 1 MB, 4MB, 16 MB and 64 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; +static const unsigned int MaxTypes = 2; +static unsigned int NumTypes = MaxTypes; +static const char *types[MaxTypes] = {"float", "double"}; +static const unsigned int TypeSize[MaxTypes] = {sizeof(cl_float), + sizeof(cl_double)}; +static const unsigned int NumVecWidths = 5; +static const char *vecWidths[NumVecWidths] = {"", "2", "4", "8", "16"}; +#define CHAR_BUF_SIZE 512 + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfUAVReadSpeedHostMem::genShader(unsigned int type, + unsigned int vecWidth, + unsigned int numReads) { + char buf[CHAR_BUF_SIZE]; + + shader_.clear(); + shader_ += + "#ifdef USE_AMD_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_KHR_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, + "__kernel void _uavReadSpeedHostMem(__global %s%s *inBuf, __global " + "%s%s *outBuf, constant uint *constBuf)\n", + types[type], vecWidths[vecWidth], types[type], vecWidths[vecWidth]); + 
shader_.append(buf); + shader_ += + "{\n" + " int i = (int) get_global_id(0);\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, " %s%s temp = 0;\n", types[type], + vecWidths[vecWidth]); + shader_.append(buf); + shader_ += " temp = *(inBuf + i);\n"; + if (vecWidth == 0) { + shader_ += + " if (temp < 0)\n" + " *(outBuf + i) = temp;\n" + "}\n"; + } else { + shader_ += + " if (temp.s0 < 0)\n" + " *(outBuf + i) = temp;\n" + "}\n"; + } + // printf("shader:\n%s\n", shader_.c_str()); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfUAVReadSpeedHostMem::OCLPerfUAVReadSpeedHostMem() { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + context_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of + // just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + char *p = strstr(charbuf, "cl_khr_fp64"); + char *p2 = strstr(charbuf, "cl_amd_fp64"); + + NumTypes = MaxTypes; + + if (!p && !p2) { + // Doubles not supported + NumTypes--; + } + _numSubTests = NumTypes * NumVecWidths * NUM_SIZES * MAX_READ_MODES; + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } +} + +OCLPerfUAVReadSpeedHostMem::~OCLPerfUAVReadSpeedHostMem() {} + +void OCLPerfUAVReadSpeedHostMem::setData(cl_mem buffer, float val) { + float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, + CL_MAP_WRITE, 0, bufSize_, + 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +void OCLPerfUAVReadSpeedHostMem::checkData(cl_mem buffer) { + float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, + CL_MAP_READ, 0, bufSize_, + 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) { + if (data[i] != (float)numReads_) { + printf("Data validation 
failed at index %d!\n", i); + printf("Expected %d %d %d %d\nGot %d %d %d %d\n", numReads_, numReads_, + numReads_, numReads_, (unsigned int)data[i], + (unsigned int)data[i + 1], (unsigned int)data[i + 2], + (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +void OCLPerfUAVReadSpeedHostMem::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + constBuffer_ = 0; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + 
// platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + + numReads_ = NumReads[test % MAX_READ_MODES]; + width_ = Sizes[(test / MAX_READ_MODES) % NUM_SIZES]; + vecSizeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES)) % NumVecWidths; + typeIdx_ = (test / (MAX_READ_MODES * NUM_SIZES * NumVecWidths)) % NumTypes; + cached_ = true; + + bufSize_ = width_; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = _wrapper->clCreateBuffer(context_, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + bufSize_, NULL, &error_); + CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed"); + + outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + constBuffer_ = _wrapper->clCreateBuffer(context_, 0, 16 * 2, NULL, &error_); + CHECK_RESULT(constBuffer_ == 0, "clCreateBuffer(constBuffer) failed"); + + genShader(typeIdx_, vecSizeIdx_, numReads_); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + 
if (cached_ && isAMD) { + args = "-fno-alias "; + } + if (typeIdx_ == 1) { + if (isAMD) { + args += "-D USE_AMD_DOUBLES "; + } else { + args += "-D USE_KHR_DOUBLES "; + } + } + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "_uavReadSpeedHostMem", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), + (void *)&constBuffer_); + + setData(inBuffer_, 0.0f); + setData(outBuffer_, 1.2345678f); + unsigned int *cBuf = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, constBuffer_, true, CL_MAP_WRITE, 0, 16 * 2, 0, NULL, NULL, + &error_); + cBuf[0] = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)); + cBuf[1] = 0; + cBuf[2] = 1024; + cBuf[3] = 2048; + cBuf[4] = 3072; + cBuf[5] = 0; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, constBuffer_, cBuf, 0, + NULL, NULL); + _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfUAVReadSpeedHostMem::run(void) { + int global = bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel 
failed"); + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Constant bandwidth in GB/s + double perf = + ((double)bufSize_ * numReads_ * NUM_ITER * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + char buf2[256]; + SNPRINTF(buf, sizeof(buf), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]); + SNPRINTF(buf2, sizeof(buf2), " %-8s (%8d) (GB/s) ", buf, width_); + testDescString = buf2; + // Test doesn't write anything + // checkData(outBuffer_); +} + +unsigned int OCLPerfUAVReadSpeedHostMem::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + error_ = _wrapper->clReleaseMemObject(inBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (constBuffer_) { + error_ = _wrapper->clReleaseMemObject(constBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(constBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h new file mode 100644 index 0000000000..20f2393313 --- /dev/null +++ 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVReadSpeedHostMem.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_UAVReadSpeedHostMem_H_ +#define _OCL_UAVReadSpeedHostMem_H_ + +#include "OCLTestImp.h" + +class OCLPerfUAVReadSpeedHostMem : public OCLTestImp { + public: + OCLPerfUAVReadSpeedHostMem(); + virtual ~OCLPerfUAVReadSpeedHostMem(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void genShader(unsigned int type, unsigned int vecWidth, + unsigned int numReads); + void setData(cl_mem buffer, float data); + void checkData(cl_mem buffer); + + static const unsigned int NUM_ITER = 100; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_mem inBuffer_; + cl_mem outBuffer_; + cl_mem constBuffer_; + cl_int error_; + + unsigned int width_; + unsigned int bufSize_; + unsigned int vecSizeIdx_; + unsigned int numReads_; + unsigned int typeIdx_; + bool isAMD; + bool cached_; +}; + +#endif // _OCL_UAVReadSpeedHostMem_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp new file mode 100644 index 0000000000..446b0c3c44 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.cpp @@ -0,0 +1,380 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLPerfUAVWriteSpeedHostMem.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "Timer.h" + +const unsigned int NUM_SIZES = 4; + +// 256KB, 1 MB, 4MB, 16 MB and 64 MB +static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304, + 16777216}; +static const unsigned int MaxTypes = 2; +static unsigned int NumTypes = 2; +static const char *types[MaxTypes] = {"float", "double"}; +static const unsigned int TypeSize[MaxTypes] = {sizeof(cl_float), + sizeof(cl_double)}; +static const unsigned int NumVecWidths = 5; +static const char *vecWidths[NumVecWidths] = {"", "2", "4", "8", "16"}; +#define CHAR_BUF_SIZE 512 + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif +void OCLPerfUAVWriteSpeedHostMem::genShader(unsigned int type, + unsigned int vecWidth) { + char buf[CHAR_BUF_SIZE]; + + shader_.clear(); + shader_ += + "#ifdef USE_AMD_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + "#endif\n"; + shader_ += + "#ifdef USE_KHR_DOUBLES\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#endif\n"; + SNPRINTF(buf, CHAR_BUF_SIZE, + "__kernel void _uavWriteSpeedHostMem(__global %s%s *outBuf)\n", + types[type], vecWidths[vecWidth]); + shader_.append(buf); + shader_ += + "{\n" + " int i = (int) get_global_id(0);\n" + " *(outBuf + i) = 0;\n" + "}\n"; + // printf("shader:\n%s\n", shader_.c_str()); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +OCLPerfUAVWriteSpeedHostMem::OCLPerfUAVWriteSpeedHostMem() { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + context_ = 0; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + 
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = + _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); + // Runtime returns an error when no GPU devices are present instead of + // just returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. + */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + char *p = strstr(charbuf, "cl_khr_fp64"); + char *p2 = strstr(charbuf, "cl_amd_fp64"); + + NumTypes = MaxTypes; + + if (!p && !p2) { + // Doubles not supported + NumTypes--; + } + _numSubTests = NumTypes * NumVecWidths 
* NUM_SIZES; + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } +} + +OCLPerfUAVWriteSpeedHostMem::~OCLPerfUAVWriteSpeedHostMem() {} + +void OCLPerfUAVWriteSpeedHostMem::setData(cl_mem buffer, float val) { + float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, + CL_MAP_WRITE, 0, bufSize_, + 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); + _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfUAVWriteSpeedHostMem::checkData(cl_mem buffer) { + float *data = (float *)_wrapper->clEnqueueMapBuffer(cmd_queue_, buffer, true, + CL_MAP_READ, 0, bufSize_, + 0, NULL, NULL, &error_); + for (unsigned int i = 0; i < (bufSize_ >> 2); i++) { + if (data[i] != 0.0f) { + printf("Data validation failed at index %d!\n", i); + printf("Expected %lf %lf %lf %lf\nGot %d %d %d %d\n", 0.0f, 0.0f, 0.0f, + 0.0f, (unsigned int)data[i], (unsigned int)data[i + 1], + (unsigned int)data[i + 2], (unsigned int)data[i + 3]); + CHECK_RESULT_NO_RETURN(0, "Data validation failed!\n"); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); + _wrapper->clFinish(cmd_queue_); +} + +void OCLPerfUAVWriteSpeedHostMem::open(unsigned int test, char *units, + double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + outBuffer_ = 0; + isAMD = false; + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new 
cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + + width_ = Sizes[test % NUM_SIZES]; + vecSizeIdx_ = (test / NUM_SIZES) % NumVecWidths; + typeIdx_ = (test / (NUM_SIZES * NumVecWidths)) % NumTypes; + + bufSize_ = width_; + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + outBuffer_ = _wrapper->clCreateBuffer( + context_, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize_, NULL, + &error_); + CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed"); + + genShader(typeIdx_, vecSizeIdx_); + char *tmp = (char *)shader_.c_str(); + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&tmp, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + std::string args; + args.clear(); + if (typeIdx_ == 1) { + if (isAMD) { + args += "-D USE_AMD_DOUBLES "; + } else { + args += "-D USE_KHR_DOUBLES "; + } + } + error_ = + _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = + _wrapper->clCreateKernel(program_, "_uavWriteSpeedHostMem", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_); + + setData(outBuffer_, 1.2345678f); +} + +void OCLPerfUAVWriteSpeedHostMem::run(void) { + int global = 
bufSize_ / (TypeSize[typeIdx_] * (1 << vecSizeIdx_)); + int local = 64; + + size_t global_work_size[1] = {(size_t)global}; + size_t local_work_size[1] = {(size_t)local}; + + CPerfCounter timer; + + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + } + _wrapper->clFinish(cmd_queue_); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + // Constant bandwidth in GB/s + double perf = ((double)bufSize_ * NUM_ITER * (double)(1e-09)) / sec; + + _perfInfo = (float)perf; + char buf[256]; + char buf2[256]; + SNPRINTF(buf, sizeof(buf), "%s%s", types[typeIdx_], vecWidths[vecSizeIdx_]); + SNPRINTF(buf2, sizeof(buf2), " %-8s (%8d) (GB/s) ", buf, width_); + testDescString = buf2; + + // Test just writes 0s + checkData(outBuffer_); +} + +unsigned int OCLPerfUAVWriteSpeedHostMem::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (outBuffer_) { + error_ = _wrapper->clReleaseMemObject(outBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h 
b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h new file mode 100644 index 0000000000..646f74ed0f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUAVWriteSpeedHostMem.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_UAVWriteSpeedHostMem_H_ +#define _OCL_UAVWriteSpeedHostMem_H_ + +#include "OCLTestImp.h" + +class OCLPerfUAVWriteSpeedHostMem : public OCLTestImp { + public: + OCLPerfUAVWriteSpeedHostMem(); + virtual ~OCLPerfUAVWriteSpeedHostMem(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + std::string shader_; + void genShader(unsigned int type, unsigned int vecWidth); + void setData(cl_mem buffer, float data); + void checkData(cl_mem buffer); + + static const unsigned int NUM_ITER = 100; + + cl_context context_; + cl_command_queue cmd_queue_; + cl_program program_; + cl_kernel kernel_; + cl_mem outBuffer_; + cl_int error_; + + unsigned int width_; + unsigned int bufSize_; + unsigned int vecSizeIdx_; + unsigned int typeIdx_; + bool isAMD; +}; + +#endif // _OCL_UAVWriteSpeedHostMem_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp new file mode 100644 index 0000000000..b9add8e915 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.cpp @@ -0,0 +1,270 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfUncoalescedRead.h" + +#include + +#include +#include + +#include "Timer.h" + +const char* OCLPerfUncoalescedRead::kernel_str = + "#define NUM_READS 32\n\ + __kernel void read_uncoalescing(__global float *input,__global float *output)\n\ + {\n\ + float val = (float)(0.0f);\n\ + size_t gid = get_global_id(0);\n\ + val = val + input[gid * NUM_READS + 0];\n\ + val = val + input[gid * NUM_READS + 1];\n\ + val = val + input[gid * NUM_READS + 2];\n\ + val = val + input[gid * NUM_READS + 3];\n\ + val = val + input[gid * NUM_READS + 4];\n\ + val = val + input[gid * NUM_READS + 5];\n\ + val = val + input[gid * NUM_READS + 6];\n\ + val = val + input[gid * NUM_READS + 7];\n\ + val = val + input[gid * NUM_READS + 8];\n\ + val = val + input[gid * NUM_READS + 9];\n\ + val = val + input[gid * NUM_READS + 10];\n\ + val = val + input[gid * NUM_READS + 11];\n\ + val = val + input[gid * NUM_READS + 12];\n\ + val = val + input[gid * NUM_READS + 13];\n\ + val = val + input[gid * NUM_READS + 14];\n\ + val = val + input[gid * NUM_READS + 15];\n\ + val = val + input[gid * NUM_READS + 16];\n\ + val = val + input[gid * NUM_READS + 17];\n\ + val = val + input[gid * NUM_READS + 18];\n\ + val = val + input[gid * NUM_READS + 19];\n\ + val = val + input[gid * NUM_READS + 20];\n\ + val = val + input[gid * NUM_READS + 21];\n\ + val = val + input[gid * NUM_READS + 22];\n\ + val = val + input[gid * NUM_READS + 23];\n\ + val = val + input[gid * NUM_READS + 24];\n\ + val = val + input[gid * NUM_READS + 25];\n\ + 
val = val + input[gid * NUM_READS + 26];\n\ + val = val + input[gid * NUM_READS + 27];\n\ + val = val + input[gid * NUM_READS + 28];\n\ + val = val + input[gid * NUM_READS + 29];\n\ + val = val + input[gid * NUM_READS + 30];\n\ + val = val + input[gid * NUM_READS + 31];\n\ + output[gid] = val;\n\ + }\n"; + +OCLPerfUncoalescedRead::OCLPerfUncoalescedRead() { _numSubTests = 3; } + +OCLPerfUncoalescedRead::~OCLPerfUncoalescedRead() {} + +void OCLPerfUncoalescedRead::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "error_ opening test"); + silentFailure = false; + _openTest = test; + program_ = 0; + kernel_ = 0; + input_buff = NULL; + + if (test > 0) { + size_t param_size = 0; + char* strVersion = 0; + error_ = _wrapper->clGetDeviceInfo( + devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + strVersion = (char*)malloc(param_size); + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_OPENCL_C_VERSION, param_size, + strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + if (strVersion[9] < '2') { + printf("\nOpenCL C 2.0 not supported\n"); + silentFailure = true; + } + free(strVersion); + if (silentFailure) return; + } + + cl_mem buffer = + _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, + SIZE * NUM_READS * sizeof(cl_float), 0, &error_); + buffers_.push_back(buffer); + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + SIZE * sizeof(cl_float), 0, &error_); + buffers_.push_back(buffer); + + srand(0x8956); + input_buff = (float*)malloc(SIZE * NUM_READS * sizeof(float)); + for (unsigned int i = 0; i < SIZE * NUM_READS; ++i) { + input_buff[i] = (float)rand(); + } + + error_ = _wrapper->clEnqueueWriteBuffer( + cmdQueues_[_deviceId], buffers_[0], CL_TRUE, 0, + SIZE * NUM_READS * sizeof(cl_float), 
input_buff, 0, 0, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + + float* buff = (float*)_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], buffers_[1], CL_TRUE, CL_MAP_WRITE, 0, + SIZE * sizeof(cl_float), 0, 0, 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer failed"); + memset(buff, 0, SIZE * sizeof(cl_float)); + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffers_[1], + buff, 0, 0, 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer failed"); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + std::string compileOptions = ""; + if (test > 0) { + compileOptions = "-cl-std=CL2.0"; + } + if (test > 1) { + compileOptions += " -fsc-use-buffer-for-hsa-global "; + } + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + compileOptions.c_str(), NULL, NULL); + + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "read_uncoalescing", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void*)&buffers_[1]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); +} + +void OCLPerfUncoalescedRead::validate(void) { + bool success = true; + float* buff = (float*)_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], buffers_[1], CL_TRUE, CL_MAP_READ, 0, + SIZE * sizeof(cl_float), 0, 0, 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer 
failed"); + for (unsigned int i = 0; i < SIZE; ++i) { + volatile float val = 0; + for (int j = 0; j < NUM_READS; ++j) { + val += input_buff[i * NUM_READS + j]; + } + if (val != buff[i]) { + success = false; + std::string errorMsg = "Invalid result. Expected: "; + errorMsg += std::to_string(val); + errorMsg += " Actual result: "; + errorMsg += std::to_string(buff[i]); + CHECK_RESULT(true, errorMsg.c_str()); + break; + } + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffers_[1], + buff, 0, 0, 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer failed"); +} + +void OCLPerfUncoalescedRead::run(void) { + if (silentFailure) { + return; + } + CPerfCounter timer; + + // Warm up + size_t workGroupSize = SIZE; + for (int i = 0; i < 50; ++i) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, &workGroupSize, NULL, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + } + + cl_event eventArr[NUM_ITER]; + timer.Reset(); + timer.Start(); + for (unsigned int i = 0; i < NUM_ITER; i++) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, &workGroupSize, NULL, 0, + NULL, &eventArr[i]); + + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + } + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_RESULT(error_, "clFinish failed"); + timer.Stop(); + double sec1 = timer.GetElapsedTime(); + double sec2 = 0; + for (unsigned int i = 0; i < NUM_ITER; ++i) { + cl_ulong startTime = 0, endTime = 0; + error_ = _wrapper->clGetEventProfilingInfo(eventArr[i], + CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &startTime, 0); + CHECK_RESULT(error_, "clGetEventProfilingInfo failed"); + error_ = _wrapper->clGetEventProfilingInfo( + eventArr[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); + CHECK_RESULT(error_, "clGetEventProfilingInfo failed"); + sec2 += 1e-9 * (endTime - 
startTime); + error_ = _wrapper->clReleaseEvent(eventArr[i]); + CHECK_RESULT(error_, "clReleaseEvent failed"); + } + + validate(); + + // Buffer copy bandwidth in GB/s + double perf1 = ((double)SIZE * NUM_READS * NUM_ITER * sizeof(cl_float) * + (double)(1e-09)) / + sec1; + double perf2 = ((double)SIZE * NUM_READS * NUM_ITER * sizeof(cl_float) * + (double)(1e-09)) / + sec2; + _perfInfo = (float)perf2; + + std::ostringstream strStream; + switch (_openTest) { + case 0: + strStream << "OCL1.2 "; + break; + case 1: + strStream << "OCL2.0 "; + break; + case 2: + strStream << "OCL2.0/flag "; + break; + } + + strStream << std::fixed << std::setprecision(2) << perf1 << " timer GB/s "; + strStream << "time: " << std::setprecision(3) << sec1 << "s (profile GB/s)"; + testDescString = strStream.str(); + ; +} + +unsigned int OCLPerfUncoalescedRead::close(void) { + if (input_buff) { + free(input_buff); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h new file mode 100644 index 0000000000..b9e1ffde1d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfUncoalescedRead.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UncoalescedRead_H_
+#define _OCL_UncoalescedRead_H_
+
+#include "OCLTestImp.h"
+// Number of consecutive input floats that are summed into one output element.
+#define NUM_READS 32
+// Performance test that launches the "read_uncoalescing" kernel repeatedly and
+// reports read bandwidth, validating the output against a host-side sum.
+class OCLPerfUncoalescedRead : public OCLTestImp {
+ public:
+  OCLPerfUncoalescedRead();
+  virtual ~OCLPerfUncoalescedRead();
+  // Builds the kernel and binds buffers_[0] and buffers_[1] as its arguments.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Warm-up launches, timed launches, validation and result reporting.
+  virtual void run(void);
+  // Frees input_buff and calls OCLTestImp::close().
+  virtual unsigned int close(void);
+
+ private:
+  // Number of timed kernel launches in run().
+  static const unsigned int NUM_ITER = 1000;
+  // Number of output floats; also used as the global work size in run().
+  static const unsigned int SIZE = 250000;
+  // OpenCL C source passed to clCreateProgramWithSource().
+  static const char* kernel_str;
+  // When set, run() returns immediately without launching anything.
+  bool silentFailure;
+  // Host copy of the input data; read by validate() and freed in close().
+  float* input_buff;
+  // Maps buffers_[1] and compares it with sums computed from input_buff.
+  void validate(void);
+};
+
+#endif  // _OCL_UncoalescedRead_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp
new file mode 100644
index 0000000000..41d17ad7f5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.cpp
@@ -0,0 +1,353 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPerfVerticalFetch.h" + +#include +#include +#include + +#include +#include + +#include "CL/opencl.h" +#include "Timer.h" + +// Quiet pesky warnings +#ifdef WIN_OS +#define SNPRINTF sprintf_s +#else +#define SNPRINTF snprintf +#endif + +#define NUM_SIZES 1 +#define WIDTH 4952 +#define HEIGHT 3288 +unsigned int Sizes[NUM_SIZES] = {WIDTH * HEIGHT * 4}; + +#define KERNEL_CODE(...) 
#__VA_ARGS__ +const static char* strKernel = KERNEL_CODE( +\n __kernel void ResizeVerticalFilter( + const __global uint* inputImage, const unsigned int inputColumns, + const unsigned int inputRows, __local uint* inputImageCache, + const int numCachedPixels, __global uint* dst) { + const unsigned int startY = get_group_id(1) * get_local_size(1); + float scale = 0.5f; + const float support = 0.5f; + const int cacheRangeStartY = + max((int)((startY + 0.5f) / 1.0f + support + 0.5f), (int)(0)); + const int cacheRangeEndY = + min((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); + const unsigned int x = get_global_id(0); + event_t e = async_work_group_strided_copy( + inputImageCache, inputImage + cacheRangeStartY * inputColumns + x, + cacheRangeEndY - cacheRangeStartY, inputColumns, 0); + wait_group_events(1, &e); + + if (get_local_id(1) == 0) { + // uint sum = 0; + // for (unsigned int chunk = 0; chunk < numCachedPixels; chunk++) { + // sum += inputImageCache[chunk]; + // } + atomic_add(dst, inputImageCache[0]); + } +} +\n); + +OCLPerfVerticalFetch::OCLPerfVerticalFetch() { + ptr_ = nullptr; + _numSubTests = 6; +} + +OCLPerfVerticalFetch::~OCLPerfVerticalFetch() {} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLPerfVerticalFetch::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + error_ = CL_SUCCESS; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = 0; + kernel_ = 0; + skip_ = false; + dstBuffer_ = 0; + cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint)); + cl_uint maxCUs; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(cl_uint), &maxCUs, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + wgs = 64; + const static cl_uint wavesPerCU = 8; + nWorkItems = maxCUs * wavesPerCU * wgs; + uint32_t 
memLoc = CL_MEM_USE_HOST_PTR; + + inputData = 0x1; + switch (test) { + case 0: + nIter = 1; + mem_type_ = "UHP"; + break; + case 1: + nIter = 100; + mem_type_ = "UHP"; + break; + case 2: + nIter = 1; + memLoc = CL_MEM_ALLOC_HOST_PTR; + mem_type_ = "AHP"; + break; + case 3: + nIter = 100; + memLoc = CL_MEM_ALLOC_HOST_PTR; + mem_type_ = "AHP"; + break; + case 4: + nIter = 1; + memLoc = 0; + mem_type_ = "dev"; + break; + case 5: + nIter = 1000; + memLoc = 0; + mem_type_ = "dev"; + break; + } + + std::string nameFile("dim.ini"); + std::fstream is(nameFile.c_str(), std::fstream::in | std::fstream::binary); + std::string line; + if (is.is_open()) { + size_t posStart = 0; + do { + std::getline(is, line); + } while (line.find_first_of('/', posStart) != std::string::npos); + // Find global/local + posStart = 0; + size_t posEnd = 1; + std::string dimS = line.substr(posStart, posEnd - posStart); + dim = std::stoi(dimS.c_str(), nullptr, 10); + posStart = posEnd; + posEnd = line.find_first_of('[', posStart); + for (cl_uint i = 0; i < dim; ++i) { + posStart = posEnd + 1; + posEnd = line.find_first_of(',', posStart); + std::string global = line.substr(posStart, posEnd - posStart); + gws[i] = std::stoi(global.c_str(), nullptr, 10); + } + posEnd = line.find_first_of('[', posStart); + for (cl_uint i = 0; i < dim; ++i) { + posStart = posEnd + 1; + posEnd = line.find_first_of(',', posStart); + std::string global = line.substr(posStart, posEnd - posStart); + lws[i] = std::stoi(global.c_str(), nullptr, 10); + } + posEnd = line.find_first_of('[', posStart); + posStart = posEnd + 1; + posEnd = line.find_first_of(',', posStart); + std::string global = line.substr(posStart, posEnd - posStart); + numCachedPixels_ = std::stoi(global.c_str(), nullptr, 10); + is.close(); + } else { + dim = 2; + gws[0] = WIDTH; + gws[1] = 512; + lws[0] = 1; + lws[1] = 256; + numCachedPixels_ = 1676; + } + cl_uint width = static_cast(gws[0]); + cl_uint height = numCachedPixels_ * static_cast(gws[1] / lws[1]); + 
if (gws[1] > 512) { + gws[1] = 512; + } + Sizes[0] = width * height * sizeof(int); + nBytes = Sizes[0]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "ResizeVerticalFilter", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + if (memLoc == CL_MEM_USE_HOST_PTR) { + ptr_ = malloc(nBytes); + } + srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY | memLoc, + nBytes, ptr_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(srcBuffer) failed"); + void* mem; + mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], srcBuffer_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, nBytes, 0, + NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); ++i) { + reinterpret_cast(mem)[i] = inputData; + } + + dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed"); + _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], srcBuffer_, mem, 0, + NULL, NULL); + mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, + sizeof(cl_uint), 0, NULL, NULL, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + memset(mem, 0, sizeof(cl_uint)); + _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], dstBuffer_, mem, 0, + NULL, NULL); + + 
error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_uint), (void*)&width); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), (void*)&height); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 3, + numCachedPixels_ * sizeof(cl_uint), 0); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), + (void*)&numCachedPixels_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = + _wrapper->clSetKernelArg(kernel_, 5, sizeof(cl_mem), (void*)&dstBuffer_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); +} + +void OCLPerfVerticalFetch::run(void) { + if (skip_) { + return; + } + + CPerfCounter timer; + + // warm up + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, dim, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + cl_uint* memResult; + memResult = (cl_uint*)malloc(sizeof(cl_uint)); + if (0 == memResult) { + CHECK_RESULT_NO_RETURN(0, "malloc failed!\n"); + return; + } + + memset(memResult, 0, sizeof(cl_uint)); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_, + CL_FALSE, 0, sizeof(cl_uint), + memResult, 0, NULL, NULL); + + CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + if (memResult[0] != ((gws[0] * gws[1]) / (lws[0] * lws[1]))) { + CHECK_RESULT_NO_RETURN(0, "Data validation failed for warm up run!\n"); + // free(memResult); + // return; + } + + free(memResult); + + timer.Reset(); + timer.Start(); + double sec2 = 0; + cl_event* events 
= new cl_event[nIter]; + for (unsigned int i = 0; i < nIter; i++) { + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, dim, + NULL, gws, lws, 0, NULL, &events[i]); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + for (unsigned int i = 0; i < nIter; i++) { + cl_ulong startTime = 0, endTime = 0; + error_ = _wrapper->clGetEventProfilingInfo( + events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); + CHECK_RESULT(error_, "clGetEventProfilingInfo failed"); + error_ = _wrapper->clGetEventProfilingInfo( + events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); + CHECK_RESULT(error_, "clGetEventProfilingInfo failed"); + + _wrapper->clReleaseEvent(events[i]); + sec2 += endTime - startTime; + } + double sec = timer.GetElapsedTime(); + delete[] events; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec; + double perf2 = ((double)nBytes * nIter) / sec2; + _perfInfo = (float)perf2; + float perfInfo = (float)perf; + char buf[256]; + SNPRINTF(buf, sizeof(buf), + " (%8d bytes, %s) i:%4d Wall time Perf: %.2f (GB/s)", nBytes, + mem_type_, nIter, perfInfo); + testDescString = buf; +} + +unsigned int OCLPerfVerticalFetch::close(void) { + if (!skip_) { + if (srcBuffer_) { + error_ = _wrapper->clReleaseMemObject(srcBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + + if (dstBuffer_) { + error_ = _wrapper->clReleaseMemObject(dstBuffer_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(srcBuffer_) failed"); + } + } + if (ptr_ != nullptr) { + free(ptr_); + ptr_ = nullptr; + } + + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h new file mode 100644 
index 0000000000..d94e1eb22f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/OCLPerfVerticalFetch.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#pragma once
+#include "OCLTestImp.h"
+
+// Performance test that launches the "ResizeVerticalFilter" kernel over a
+// source buffer placed in one of three memory types and reports read
+// bandwidth.
+class OCLPerfVerticalFetch : public OCLTestImp {
+ public:
+  OCLPerfVerticalFetch();
+  virtual ~OCLPerfVerticalFetch();
+
+ public:
+  // Builds the kernel, creates/fills the buffers and binds kernel arguments.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // One warm-up launch followed by nIter timed launches.
+  virtual void run(void);
+  // Releases the buffers and ptr_, then calls OCLTestImp::close().
+  virtual unsigned int close(void);
+
+  cl_mem srcBuffer_;  // source buffer read by the kernel (nBytes large)
+  cl_mem dstBuffer_;  // one cl_uint the kernel accumulates into
+  unsigned int nWorkItems;  // number of GPU work items
+  unsigned int wgs;         // work group size
+  unsigned int nBytes;      // input and output buffer size
+  unsigned int nIter;       // overall number of timing loops
+  cl_uint inputData;        // input data to fill the input buffer
+  bool skip_;  // when set, run() returns early and close() keeps the buffers
+  void* ptr_;  // host memory backing srcBuffer_ for CL_MEM_USE_HOST_PTR
+  const char* mem_type_;  // short memory-type label shown in the result line
+  cl_uint dim;    // number of NDRange dimensions used for the launch
+  size_t gws[3];  // global work size per dimension
+  size_t lws[3];  // local work size per dimension
+  cl_uint numCachedPixels_;  // element count of the kernel's __local cache
+};
diff --git a/projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp
new file mode 100644
index 0000000000..343471a45d
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/perf/TestList.cpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLTestListImp.h" + +// +// Includes for tests +// +#include "OCLPerfAES256.h" +#include "OCLPerfAtomicSpeed.h" +#include "OCLPerfBufferCopyOverhead.h" +#include "OCLPerfBufferCopySpeed.h" +#include "OCLPerfBufferReadSpeed.h" +#include "OCLPerfBufferWriteSpeed.h" +#include "OCLPerfCPUMemSpeed.h" +#include "OCLPerfCommandQueue.h" +#include "OCLPerfConcurrency.h" +#include "OCLPerfDevMemReadSpeed.h" +#include "OCLPerfDevMemWriteSpeed.h" +#include "OCLPerfDeviceConcurrency.h" +#include "OCLPerfDeviceEnqueue.h" +#include "OCLPerfDispatchSpeed.h" +#include "OCLPerfDoubleDMA.h" +#include "OCLPerfDoubleDMASeq.h" +#include "OCLPerfFillBuffer.h" +#include "OCLPerfFillImage.h" +#include "OCLPerfFlush.h" +#include "OCLPerfGenericBandwidth.h" +#include "OCLPerfGenoilSiaMiner.h" +#include "OCLPerfImageCopyCorners.h" +#include "OCLPerfImageCopySpeed.h" +#include "OCLPerfImageMapUnmap.h" +#include "OCLPerfImageReadSpeed.h" +#include "OCLPerfImageSampleRate.h" +#include "OCLPerfImageWriteSpeed.h" +#include "OCLPerfKernelArguments.h" +#include "OCLPerfLDSLatency.h" +#include "OCLPerfLDSReadSpeed.h" +#include "OCLPerfMandelbrot.h" +#include "OCLPerfMapBufferReadSpeed.h" +#include "OCLPerfMapBufferWriteSpeed.h" +#include "OCLPerfMapImageReadSpeed.h" +#include "OCLPerfMapImageWriteSpeed.h" +#include "OCLPerfMatrixTranspose.h" +#include "OCLPerfMemCombine.h" +#include "OCLPerfMemCreate.h" +#include "OCLPerfMemLatency.h" +#include "OCLPerfPinnedBufferReadSpeed.h" +#include "OCLPerfPinnedBufferWriteSpeed.h" +#include "OCLPerfPipeCopySpeed.h" +#include "OCLPerfSHA256.h" +#include "OCLPerfSampleRate.h" +#include "OCLPerfScalarReplArrayElem.h" +#include "OCLPerfSdiP2PCopy.h" +#include "OCLPerfSepia.h" +#include 
"OCLPerfTextureMemLatency.h" +#include "OCLPerfUAVReadSpeed.h" +#include "OCLPerfUAVReadSpeedHostMem.h" +#include "OCLPerfUAVWriteSpeedHostMem.h" +#include "OCLPerfVerticalFetch.h" +// 2.0 +#include "OCLPerf3DImageWriteSpeed.h" +#include "OCLPerfAtomicSpeed20.h" +#include "OCLPerfDeviceEnqueue2.h" +#include "OCLPerfDeviceEnqueueEvent.h" +#include "OCLPerfDeviceEnqueueSier.h" +#include "OCLPerfImageCreate.h" +#include "OCLPerfImageReadWrite.h" +#include "OCLPerfImageReadsRGBA.h" +#include "OCLPerfProgramGlobalRead.h" +#include "OCLPerfProgramGlobalWrite.h" +#include "OCLPerfSVMAlloc.h" +#include "OCLPerfSVMKernelArguments.h" +#include "OCLPerfSVMMap.h" +#include "OCLPerfSVMMemFill.h" +#include "OCLPerfSVMMemcpy.h" +#include "OCLPerfSVMSampleRate.h" +#include "OCLPerfUncoalescedRead.h" + +// +// Helper macro for adding tests +// +template +static void* dictionary_CreateTestFunc(void) { + return new T(); +} + +#define TEST(name) \ + { #name, &dictionary_CreateTestFunc < name> } + +TestEntry TestList[] = { + TEST(OCLPerfUAVReadSpeed), + TEST(OCLPerfUAVReadSpeedHostMem), + TEST(OCLPerfUAVWriteSpeedHostMem), + TEST(OCLPerfLDSReadSpeed), + TEST(OCLPerfDispatchSpeed), + TEST(OCLPerfMapBufferReadSpeed), + TEST(OCLPerfMapBufferWriteSpeed), + TEST(OCLPerfBufferReadSpeed), + TEST(OCLPerfBufferReadRectSpeed), + TEST(OCLPerfPinnedBufferReadSpeed), + TEST(OCLPerfPinnedBufferReadRectSpeed), + TEST(OCLPerfBufferWriteSpeed), + TEST(OCLPerfBufferWriteRectSpeed), + TEST(OCLPerfPinnedBufferWriteSpeed), + TEST(OCLPerfPinnedBufferWriteRectSpeed), + TEST(OCLPerfBufferCopySpeed), + TEST(OCLPerfBufferCopyRectSpeed), + TEST(OCLPerfMapImageReadSpeed), + TEST(OCLPerfMapImageWriteSpeed), + TEST(OCLPerfMemCombine), + TEST(OCLPerfImageReadSpeed), + TEST(OCLPerfPinnedImageReadSpeed), + TEST(OCLPerfImageWriteSpeed), + TEST(OCLPerfPinnedImageWriteSpeed), + TEST(OCLPerfImageCopySpeed), + TEST(OCLPerfCPUMemSpeed), + TEST(OCLPerfMandelbrot), + TEST(OCLPerfAsyncMandelbrot), + TEST(OCLPerfConcurrency), + 
TEST(OCLPerfDeviceConcurrency), + TEST(OCLPerfAES256), + TEST(OCLPerfSHA256), + TEST(OCLPerfAtomicSpeed), + TEST(OCLPerfMatrixTranspose), + TEST(OCLPerfImageCopyCorners), + TEST(OCLPerfScalarReplArrayElem), + TEST(OCLPerfSdiP2PCopy), + TEST(OCLPerfSepia), + TEST(OCLPerfFlush), + TEST(OCLPerfMemCreate), + TEST(OCLPerfImageMapUnmap), + TEST(OCLPerfCommandQueue), + TEST(OCLPerfKernelArguments), + TEST(OCLPerfDoubleDMA), + TEST(OCLPerfDoubleDMASeq), + TEST(OCLPerfMemLatency), + TEST(OCLPerfTextureMemLatency), + TEST(OCLPerfSampleRate), + TEST(OCLPerfImageSampleRate), + TEST(OCLPerfBufferCopyOverhead), + TEST(OCLPerfMapDispatchSpeed), + TEST(OCLPerfDeviceEnqueue), + TEST(OCLPerfPipeCopySpeed), + TEST(OCLPerfGenericBandwidth), + TEST(OCLPerfLDSLatency), + TEST(OCLPerfDeviceEnqueue2), + TEST(OCLPerfSVMAlloc), + TEST(OCLPerfSVMMap), + TEST(OCLPerfDeviceEnqueueEvent), + TEST(OCLPerfSVMKernelArguments), + TEST(OCLPerfDeviceEnqueueSier), + TEST(OCLPerfProgramGlobalRead), + TEST(OCLPerfProgramGlobalWrite), + TEST(OCLPerfAtomicSpeed20), + TEST(OCLPerfSVMSampleRate), + TEST(OCLPerfImageCreate), + TEST(OCLPerfImageReadsRGBA), + TEST(OCLPerf3DImageWriteSpeed), + TEST(OCLPerfImageReadWrite), + TEST(OCLPerfSVMMemcpy), + TEST(OCLPerfSVMMemFill), + TEST(OCLPerfFillBuffer), + TEST(OCLPerfFillImage), + TEST(OCLPerfUncoalescedRead), + TEST(OCLPerfGenoilSiaMiner), + TEST(OCLPerfDevMemReadSpeed), + TEST(OCLPerfDevMemWriteSpeed), + TEST(OCLPerfVerticalFetch), +}; + +unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]); +unsigned int TestLibVersion = 0; +const char* TestLibName = "oclperf"; diff --git a/projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude b/projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude new file mode 100644 index 0000000000..5004785a63 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/perf/oclperf.exclude @@ -0,0 +1,28 @@ +# We don't need to run regressions on these tests, they are purely for performance testing and debugging 
+OCLPerfMemLatency +OCLPerfTextureMemLatency +OCLPerfSampleRate +OCLPerfImageSampleRate +OCLPerfBufferCopyOverhead +OCLPerfDeviceEnqueue +OCLPerfPipeCopySpeed +OCLPerfGenericBandwidth +OCLPerfLDSLatency +OCLPerfFillBuffer +OCLPerfDeviceEnqueue2 +OCLPerfDeviceEnqueueEvent +OCLPerfDeviceEnqueueSier +OCLPerfSVMAlloc +OCLPerfSVMMap +OCLPerfSVMKernelArguments +OCLPerfProgramGlobalRead +OCLPerfProgramGlobalWrite +OCLPerfAtomicSpeed20 +OCLPerfSVMSampleRate +OCLPerfImageCreate +OCLPerfImageReadsRGBA +OCLPerf3DImageWriteSpeed +OCLPerfImageReadWrite +OCLPerfSVMMemcpy +OCLPerfSVMMemFill +OCLPerfFillImage diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp new file mode 100644 index 0000000000..7e5df567ab --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLAsyncMap.h" + +#include +#include +#include +#include + +#include "CL/cl.h" + +static const size_t BufSize = 0x800000; +static const size_t MapRegion = 0x100000; +static const unsigned int NumMaps = BufSize / MapRegion; + +OCLAsyncMap::OCLAsyncMap() { _numSubTests = 1; } + +OCLAsyncMap::~OCLAsyncMap() {} + +void OCLAsyncMap::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + cl_mem buffer; + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + BufSize * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLAsyncMap::run(void) { + cl_uint* values[NumMaps]; + cl_mem mapBuffer = buffers()[0]; + size_t offset = 0; + size_t region = MapRegion * sizeof(cl_uint); + + for (unsigned int i = 0; i < NumMaps; ++i) { + values[i] = reinterpret_cast(_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], mapBuffer, CL_TRUE, (CL_MAP_READ | CL_MAP_WRITE), + offset, region, 0, NULL, NULL, &error_)); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed"); + offset += region; + } + + for (unsigned int i = 0; i < NumMaps; ++i) { + for (unsigned int j = 0; j < MapRegion; ++j) { + values[i][j] = i; + } + } + + for (unsigned int i = 0; i < NumMaps; ++i) { + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer, + values[i], 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() 
failed"); + } + + values[0] = reinterpret_cast(_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], mapBuffer, CL_TRUE, CL_MAP_READ, 0, + BufSize * sizeof(cl_uint), 0, NULL, NULL, &error_)); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed"); + + for (unsigned int i = 0; i < NumMaps; ++i) { + values[i] = values[0] + i * MapRegion; + for (unsigned int j = 0; j < MapRegion; ++j) { + CHECK_RESULT((values[i][j] != i), "validation failed"); + } + } + + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer, + values[0], 0, NULL, NULL); + + _wrapper->clFinish(cmdQueues_[_deviceId]); +} + +unsigned int OCLAsyncMap::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h new file mode 100644 index 0000000000..93cb3f52a3 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncMap.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ASYNC_MAP_H_
+#define _OCL_ASYNC_MAP_H_
+
+#include "OCLTestImp.h"
+
+// Runtime test that maps one buffer as several simultaneously held regions,
+// writes a distinct value into each, and verifies the contents through a
+// single whole-buffer mapping.
+class OCLAsyncMap : public OCLTestImp {
+ public:
+  OCLAsyncMap();
+  virtual ~OCLAsyncMap();
+
+ public:
+  // Creates the buffer that run() maps.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Performs the map / fill / unmap / validate sequence.
+  virtual void run(void);
+  // Forwards to OCLTestImp::close().
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_ASYNC_MAP_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp
new file mode 100644
index 0000000000..15df346b6e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLAsyncTransfer.h" + +#include +#include +#include +#include + +#include "CL/cl.h" + +static const size_t Iterations = 0x100; +static const size_t IterationDivider = 2; +static const size_t MaxBuffers = IterationDivider; +static const size_t BufSize = 0x800000; + +const static char* strKernel = + "__kernel void factorial(__global uint* out) \n" + "{ \n" + " uint id = get_global_id(0); \n" + " uint factorial = 1; \n" + " for (uint i = 1; i < (id / 0x10000); ++i) \n" + " { \n" + " factorial *= i; \n" + " } \n" + " out[id] = factorial; \n" + "} \n"; + +OCLAsyncTransfer::OCLAsyncTransfer() { _numSubTests = 1; } + +OCLAsyncTransfer::~OCLAsyncTransfer() {} + +void OCLAsyncTransfer::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + for (size_t i = 0; i < MaxBuffers; ++i) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + BufSize * 
sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } + + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, + BufSize * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLAsyncTransfer::run(void) { + void* values; + CPerfCounter timer; + cl_mem mapBuffer = buffers()[MaxBuffers]; + + values = _wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], mapBuffer, true, (CL_MAP_READ | CL_MAP_WRITE), 0, + BufSize * sizeof(cl_uint), 0, NULL, NULL, &error_); + + timer.Reset(); + timer.Start(); + size_t x; + for (x = 0; x < Iterations / IterationDivider; x++) { + for (size_t y = 0; y < IterationDivider; ++y) { + cl_mem buffer = buffers()[y]; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {BufSize}; + error_ = _wrapper->clEnqueueNDRangeKernel( + cmdQueues_[_deviceId], kernel_, 1, NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + + cl_mem readBuffer = buffers()[0]; + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], readBuffer, + false, 0, BufSize * sizeof(cl_uint), + values, 0, NULL, NULL); + _wrapper->clFlush(cmdQueues_[_deviceId]); + + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + + double sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + double perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf(" Time: %.2f sec, BW: %.2f GB/s ", sec, perf); + + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer, + values, 0, NULL, 
NULL); + _wrapper->clFinish(cmdQueues_[_deviceId]); +} + +unsigned int OCLAsyncTransfer::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h new file mode 100644 index 0000000000..96303e7de2 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAsyncTransfer.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_ASYNC_TRANSFER_H_
+#define _OCL_ASYNC_TRANSFER_H_
+
+#include "OCLTestImp.h"
+
+// Test case derived from OCLTestImp. It only overrides the open/run/close
+// entry points declared below and adds no data members of its own.
+class OCLAsyncTransfer : public OCLTestImp {
+ public:
+  OCLAsyncTransfer();
+  virtual ~OCLAsyncTransfer();
+
+ public:
+  // Prepares the test for the device selected by deviceID.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Executes the test body.
+  virtual void run(void);
+  // Tears the test down and returns the status code from the implementation.
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_ASYNC_TRANSFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp
new file mode 100644
index 0000000000..083cb45ed6
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.cpp
@@ -0,0 +1,168 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLAtomicCounter.h" + +#include +#include +#include + +#include "CL/cl.h" + +const static unsigned int MaxCounters = 2; +const static char* strKernel = + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable \n" + "__kernel void atomic_test( \n" + " counter32_t counter0, counter32_t counter1, global uint* out_val) \n" + "{ \n" + " if (!get_global_id(0)) { \n" + " uint val0 = atomic_inc(counter0); \n" + " uint val1 = atomic_dec(counter1); \n" + " out_val[0] = val0; \n" + " out_val[1] = val1; \n" + " } \n" + "} \n"; + +OCLAtomicCounter::OCLAtomicCounter() { + _numSubTests = 1; + failed_ = false; +} + +OCLAtomicCounter::~OCLAtomicCounter() {} + +void OCLAtomicCounter::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening"); + + char name[1024] = {0}; + size_t size = 0; + + if (deviceId >= deviceCount_) { + failed_ = true; + return; + } + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024, + name, &size); + if (!strstr(name, "cl_ext_atomic_counter")) { + printf("Atomic counter extension is required for this test!\n"); + failed_ = true; + return; + } + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], "-legacy", + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "atomic_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + for (unsigned int i = 0; i < 
MaxCounters; ++i) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } + + buffer = + _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + MaxCounters * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLAtomicCounter::run(void) { + if (failed_) { + return; + } + cl_uint initVal[2] = {5, 10}; + for (unsigned int i = 0; i < MaxCounters; ++i) { + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers()[i], + true, 0, sizeof(cl_uint), + &initVal[i], 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + } + + for (unsigned int i = 0; i < MaxCounters + 1; ++i) { + cl_mem buffer = buffers()[i]; + error_ = _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + } + + size_t gws[1] = {64}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + cl_uint outputV[MaxCounters] = {0}; + + // Find the new counter value + initVal[0]++; + initVal[1]--; + + for (unsigned int i = 0; i < MaxCounters; ++i) { + cl_mem buffer = buffers()[i]; + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers()[i], + true, 0, sizeof(cl_uint), + &outputV[i], 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + if (initVal[i] != outputV[i]) { + printf("%d != %d", initVal[i], outputV[i]); + CHECK_RESULT(true, " - Incorrect result for counter!\n"); + } + } + + // Restore the original value to check the returned result in the 
kernel + initVal[0]--; + initVal[1]++; + + cl_mem buffer = buffers()[MaxCounters]; + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffers()[MaxCounters], true, 0, + MaxCounters * sizeof(cl_uint), outputV, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + for (unsigned int i = 0; i < MaxCounters; ++i) { + if (initVal[i] != outputV[i]) { + printf("%d != %d", initVal[i], outputV[i]); + CHECK_RESULT(true, + " - Incorrect result for counter inside kernel. Returned " + "value != original.\n"); + } + } +} + +unsigned int OCLAtomicCounter::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h new file mode 100644 index 0000000000..d4bdb1a453 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLAtomicCounter.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_ATOMIC_COUNTER_H_
+#define _OCL_ATOMIC_COUNTER_H_
+
+#include "OCLTestImp.h"
+
+// Test case derived from OCLTestImp. It overrides the open/run/close entry
+// points and keeps one private flag.
+class OCLAtomicCounter : public OCLTestImp {
+ public:
+  OCLAtomicCounter();
+  virtual ~OCLAtomicCounter();
+
+ public:
+  // Prepares the test for the device selected by deviceID.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Executes the test body.
+  virtual void run(void);
+  // Tears the test down and returns the status code from the implementation.
+  virtual unsigned int close(void);
+
+ private:
+  // NOTE(review): presumably marks that the test cannot run on the selected
+  // device so that run() can bail out - confirm against the .cpp file.
+  bool failed_;
+};
+
+#endif  // _OCL_ATOMIC_COUNTER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp
new file mode 100644
index 0000000000..b8a07c0df4
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.cpp
@@ -0,0 +1,612 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLBlitKernel.h" + +#include +#include +#include +#include + +#include "CL/cl.h" + +const static cl_uint Stages = 4; +const static cl_uint ThreadsForCheck = 1 << Stages; + +#define KERNEL_CODE(...) #__VA_ARGS__ + +const static char* strKernel = + KERNEL_CODE( + \n + \x23 if OCL20 + \n + extern void __amd_scheduler(__global void *, __global void *, uint); + \n + \x23 endif + \n + extern void __amd_copyBufferToImage( + __global uint*, __write_only image2d_array_t, ulong4, + int4, int4, uint4, ulong4); + + extern void __amd_copyImageToBuffer( + __read_only image2d_array_t, __global uint*, __global ushort*, + __global uchar*, int4, ulong4, int4, uint4, ulong4); + + extern void __amd_copyImage( + __read_only image2d_array_t, __write_only image2d_array_t, + int4, int4, int4); + + extern void __amd_copyImage1DA( + __read_only image2d_array_t, __write_only image2d_array_t, + int4, int4, int4); + + extern void __amd_copyBufferRect( + __global uchar*, __global uchar*, + ulong4, ulong4, ulong4); + + extern void __amd_copyBufferRectAligned( + __global uint*, __global uint*, + ulong4, ulong4, ulong4); + + extern void __amd_copyBuffer( + __global uchar*, __global uchar*, + ulong, ulong, ulong, uint); + + extern void __amd_copyBufferAligned( + __global uint*, __global uint*, + ulong, ulong, ulong, uint); + + extern void __amd_fillBuffer( + __global uchar*, __global uint*, __constant uchar*, + uint, ulong, ulong); + + extern void __amd_fillImage( + __write_only image2d_array_t, + float4, int4, uint4, int4, int4, uint); + + __kernel void copyBufferToImage( + __global uint* src, + __write_only image2d_array_t dst, + ulong4 srcOrigin, + int4 dstOrigin, + int4 size, + uint4 format, + ulong4 pitch) + { + 
__amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch); + } + + __kernel void copyImageToBuffer( + __read_only image2d_array_t src, + __global uint* dstUInt, + __global ushort* dstUShort, + __global uchar* dstUChar, + int4 srcOrigin, + ulong4 dstOrigin, + int4 size, + uint4 format, + ulong4 pitch) + { + __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar, + srcOrigin, dstOrigin, size, format, pitch); + } + + __kernel void copyImage( + __read_only image2d_array_t src, + __write_only image2d_array_t dst, + int4 srcOrigin, + int4 dstOrigin, + int4 size) + { + __amd_copyImage(src, dst, srcOrigin, dstOrigin, size); + } + + __kernel void copyImage1DA( + __read_only image2d_array_t src, + __write_only image2d_array_t dst, + int4 srcOrigin, + int4 dstOrigin, + int4 size) + { + __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size); + } + + __kernel void copyBufferRect( + __global uchar* src, + __global uchar* dst, + ulong4 srcRect, + ulong4 dstRect, + ulong4 size) + { + __amd_copyBufferRect(src, dst, srcRect, dstRect, size); + } + + __kernel void copyBufferRectAligned( + __global uint* src, + __global uint* dst, + ulong4 srcRect, + ulong4 dstRect, + ulong4 size) + { + __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size); + } + + __kernel void copyBuffer( + __global uchar* srcI, + __global uchar* dstI, + ulong srcOrigin, + ulong dstOrigin, + ulong size, + uint remain) + { + __amd_copyBuffer(srcI, dstI, srcOrigin, dstOrigin, size, remain); + } + + __kernel void copyBufferAligned( + __global uint* src, + __global uint* dst, + ulong srcOrigin, + ulong dstOrigin, + ulong size, + uint alignment) + { + __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment); + } + + __kernel void fillBuffer( + __global uchar* bufUChar, + __global uint* bufUInt, + __constant uchar* pattern, + uint patternSize, + ulong offset, + ulong size) + { + __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size); + } + + __kernel void 
fillImage( + __write_only image2d_array_t image, + float4 patternFLOAT4, + int4 patternINT4, + uint4 patternUINT4, + int4 origin, + int4 size, + uint type) + { + __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, + origin, size, type); + } + \n + \x23 if OCL20 + \n + typedef struct _HsaAqlDispatchPacket { + uint mix; + ushort workgroup_size[3]; + ushort reserved2; + uint grid_size[3]; + uint private_segment_size_bytes; + uint group_segment_size_bytes; + ulong kernel_object_address; + ulong kernel_arg_address; + ulong reserved3; + ulong completion_signal; + } HsaAqlDispatchPacket; + \n + // This is an OpenCLized hsa_control_directives_t + typedef struct _AmdControlDirectives { + ulong enabled_control_directives; + ushort enable_break_exceptions; + ushort enable_detect_exceptions; + uint max_dynamic_group_size; + ulong max_flat_grid_size; + uint max_flat_workgroup_size; + uchar required_dim; + uchar reserved1[3]; + ulong required_grid_size[3]; + uint required_workgroup_size[3]; + uchar reserved2[60]; + } AmdControlDirectives; + \n + // This is an OpenCLized amd_kernel_code_t + typedef struct _AmdKernelCode { + uint amd_kernel_code_version_major; + uint amd_kernel_code_version_minor; + ushort amd_machine_kind; + ushort amd_machine_version_major; + ushort amd_machine_version_minor; + ushort amd_machine_version_stepping; + long kernel_code_entry_byte_offset; + long kernel_code_prefetch_byte_offset; + ulong kernel_code_prefetch_byte_size; + ulong max_scratch_backing_memory_byte_size; + uint compute_pgm_rsrc1; + uint compute_pgm_rsrc2; + uint kernel_code_properties; + uint workitem_private_segment_byte_size; + uint workgroup_group_segment_byte_size; + uint gds_segment_byte_size; + ulong kernarg_segment_byte_size; + uint workgroup_fbarrier_count; + ushort wavefront_sgpr_count; + ushort workitem_vgpr_count; + ushort reserved_vgpr_first; + ushort reserved_vgpr_count; + ushort reserved_sgpr_first; + ushort reserved_sgpr_count; + ushort 
debug_wavefront_private_segment_offset_sgpr; + ushort debug_private_segment_buffer_sgpr; + uchar kernarg_segment_alignment; + uchar group_segment_alignment; + uchar private_segment_alignment; + uchar wavefront_size; + int call_convention; + uchar reserved1[12]; + ulong runtime_loader_kernel_symbol; + AmdControlDirectives control_directives; + } AmdKernelCode; + \n + typedef struct _HwDispatchHeader { + uint writeData0; // CP WRITE_DATA write to rewind for memory + uint writeData1; + uint writeData2; + uint writeData3; + uint rewind; // REWIND execution + uint startExe; // valid bit + uint condExe0; // 0xC0032200 -- TYPE 3, COND_EXEC + uint condExe1; // 0x00000204 ---- + uint condExe2; // 0x00000000 ---- + uint condExe3; // 0x00000000 ---- + uint condExe4; // 0x00000000 ---- + } HwDispatchHeader; + \n + typedef struct _HwDispatch { + uint packet0; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (6 values) + uint offset0; // 0x00000204 ---- OFFSET + uint startX; // 0x00000000 ---- COMPUTE_START_X: START = 0x0 + uint startY; // 0x00000000 ---- COMPUTE_START_Y: START = 0x0 + uint startZ; // 0x00000000 ---- COMPUTE_START_Z: START = 0x0 + uint wrkGrpSizeX; // 0x00000000 ---- COMPUTE_NUM_THREAD_X: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 + uint wrkGrpSizeY; // 0x00000000 ---- COMPUTE_NUM_THREAD_Y: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 + uint wrkGrpSizeZ; // 0x00000000 ---- COMPUTE_NUM_THREAD_Z: NUM_THREAD_FULL = 0x0, NUM_THREAD_PARTIAL = 0x0 + uint packet1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) + uint offset1; // 0x0000020C ---- OFFSET + uint isaLo; // 0x00000000 ---- COMPUTE_PGM_LO: DATA = 0x0 + uint isaHi; // 0x00000000 ---- COMPUTE_PGM_HI: DATA = 0x0, INST_ATC__CI__VI = 0x0 + uint packet2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) + uint offset2; // 0x00000212 ---- OFFSET + uint resource1; // 0x00000000 ---- COMPUTE_PGM_RSRC1 + uint resource2; // 0x00000000 ---- COMPUTE_PGM_RSRC2 + uint packet3; // 0xc0017602 
-- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value) + uint offset3; // 0x00000215 ---- OFFSET + uint pad31; // 0x000003ff ---- COMPUTE_RESOURCE_LIMITS + uint packet31; // 0xC0067602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (1 value) + uint offset31; // 0x00000218 ---- OFFSET + uint ringSize; // 0x00000000 ---- COMPUTE_TMPRING_SIZE: WAVES = 0x0, WAVESIZE = 0x0 + uint user0; // 0xC0047602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (4 values) + uint offsUser0; // 0x00000240 ---- OFFSET + uint scratchLo; // 0x00000000 ---- COMPUTE_USER_DATA_0: DATA = 0x0 + uint scratchHi; // 0x80000000 ---- COMPUTE_USER_DATA_1: DATA = 0x80000000 + uint scratchSize; // 0x00000000 ---- COMPUTE_USER_DATA_2: DATA = 0x0 + uint padUser; // 0x00EA7FAC ---- COMPUTE_USER_DATA_3: DATA = 0xEA7FAC + uint user1; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) + uint offsUser1; // 0x00000244 ---- OFFSET + uint aqlPtrLo; // 0x00000000 ---- COMPUTE_USER_DATA_4: DATA = 0x0 + uint aqlPtrHi; // 0x00000000 ---- COMPUTE_USER_DATA_5: DATA = 0x0 + uint user2; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) + uint offsUser2; // 0x00000246 ---- OFFSET + uint hsaQueueLo; // 0x00000000 ---- COMPUTE_USER_DATA_6: DATA = 0x0 + uint hsaQueueHi; // 0x00000000 ---- COMPUTE_USER_DATA_7: DATA = 0x0 + uint user3; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) + uint offsUser3; // 0x00000246 ---- OFFSET + uint argsLo; // 0x00000000 ---- COMPUTE_USER_DATA_8: DATA = 0x0 + uint argsHi; // 0x00000000 ---- COMPUTE_USER_DATA_9: DATA = 0x0 + uint copyData; // 0xC0044000 -- TYPE 3, COPY_DATA + uint copyDataFlags; // 0x00000405 ---- srcSel 0x5, destSel 0x4, countSel 0x0, wrConfirm 0x0, engineSel 0x0 + uint scratchAddrLo; // 0x000201C4 ---- srcAddressLo + uint scratchAddrHi; // 0x00000000 ---- srcAddressHi + uint shPrivateLo; // 0x00002580 ---- dstAddressLo + uint shPrivateHi; // 0x00000000 ---- dstAddressHi + uint user4; // 0xC0027602 -- TYPE 3, SET_SH_REG, TYPE:COMPUTE (2 values) + uint offsUser4; // 
0x00000248 ---- OFFSET + uint scratchOffs; // 0x00000000 ---- COMPUTE_USER_DATA_10: DATA = 0x0 + uint privSize; // 0x00000030 ---- COMPUTE_USER_DATA_11: DATA = 0x30 + uint packet4; // 0xC0031502 -- TYPE 3, DISPATCH_DIRECT, TYPE:COMPUTE + uint glbSizeX; // 0x00000000 + uint glbSizeY; // 0x00000000 + uint glbSizeZ; // 0x00000000 + uint padd41; // 0x00000021 + } HwDispatch; + \n + static const uint WavefrontSize = 64; + static const uint MaxWaveSize = 0x400; + static const uint UsrRegOffset = 0x240; + static const uint Pm4Nop = 0xC0001002; + static const uint Pm4UserRegs = 0xC0007602; + static const uint Pm4CopyReg = 0xC0044000; + static const uint PrivateSegEna = 0x1; + static const uint DispatchEna = 0x2; + static const uint QueuePtrEna = 0x4; + static const uint KernelArgEna = 0x8; + static const uint FlatScratchEna = 0x20; + \n + uint GetCmdTemplateHeaderSize() { return sizeof(HwDispatchHeader); } + \n + uint GetCmdTemplateDispatchSize() { return sizeof(HwDispatch); } + \n + void EmptyCmdTemplateDispatch(ulong cmdBuf) + { + volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf; + dispatch->glbSizeX = 0; + dispatch->glbSizeY = 0; + dispatch->glbSizeZ = 0; + } + \n + void RunCmdTemplateDispatch( + ulong cmdBuf, + __global HsaAqlDispatchPacket* aqlPkt, + ulong scratch, + ulong hsaQueue, + uint scratchSize, + uint scratchOffset, + uint numMaxWaves, + uint useATC) + \n + { + volatile __global HwDispatch* dispatch = (volatile __global HwDispatch*)cmdBuf; + uint usrRegCnt = 0; + + // Program workgroup size + dispatch->wrkGrpSizeX = aqlPkt->workgroup_size[0]; + dispatch->wrkGrpSizeY = aqlPkt->workgroup_size[1]; + dispatch->wrkGrpSizeZ = aqlPkt->workgroup_size[2]; + + // ISA address + __global AmdKernelCode* kernelObj = (__global AmdKernelCode*)aqlPkt->kernel_object_address; + ulong isa = aqlPkt->kernel_object_address + kernelObj->kernel_code_entry_byte_offset; + + dispatch->isaLo = (uint)(isa >> 8); + dispatch->isaHi = (uint)(isa >> 40) | (useATC 
? 0x100 : 0); + + // Program PGM resource registers + dispatch->resource1 = kernelObj->compute_pgm_rsrc1; + dispatch->resource2 = kernelObj->compute_pgm_rsrc2; + + uint flags = kernelObj->kernel_code_properties; + uint privateSize = kernelObj->workitem_private_segment_byte_size; + + uint ldsSize = aqlPkt->group_segment_size_bytes + + kernelObj->workgroup_group_segment_byte_size; + + // Align up the LDS blocks 128 * 4(in DWORDs) + uint ldsBlocks = (ldsSize + 511) >> 9; + + dispatch->resource2 |= (ldsBlocks << 15); + + // Private/scratch segment was enabled + if (flags & PrivateSegEna) { + uint waveSize = privateSize * WavefrontSize; + // 256 DWRODs is the minimum for SQ + waveSize = max(MaxWaveSize, waveSize); + + uint numWaves = scratchSize / waveSize; + + numWaves = min(numWaves, numMaxWaves); + + dispatch->ringSize = numWaves; + dispatch->ringSize |= (waveSize >> 10) << 12; + dispatch->user0 = Pm4UserRegs | (4 << 16); + dispatch->scratchLo = (uint)scratch; + dispatch->scratchHi = ((uint)(scratch >> 32)) | 0x80000000; // Enables swizzle + dispatch->scratchSize = scratchSize; + usrRegCnt += 4; + } + else { + dispatch->ringSize = 0; + dispatch->user0 = Pm4Nop | (4 << 16); + } + + // Pointer to the AQL dispatch packet + dispatch->user1 = (flags & DispatchEna) ? (Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16)); + dispatch->offsUser1 = UsrRegOffset + usrRegCnt; + usrRegCnt += (flags & DispatchEna) ? 2 : 0; + ulong gpuAqlPtr = (ulong)aqlPkt; + dispatch->aqlPtrLo = (uint)gpuAqlPtr; + dispatch->aqlPtrHi = (uint)(gpuAqlPtr >> 32); + + // Pointer to the AQL queue header + if (flags & QueuePtrEna) { + dispatch->user2 = Pm4UserRegs | (2 << 16); + dispatch->offsUser2 = UsrRegOffset + usrRegCnt; + usrRegCnt += 2; + dispatch->hsaQueueLo = (uint)hsaQueue; + dispatch->hsaQueueHi = (uint)(hsaQueue >> 32); + } + else { + dispatch->user2 = Pm4Nop | (2 << 16); + } + + // Pointer to the AQL kernel arguments + dispatch->user3 = (flags & KernelArgEna) ? 
(Pm4UserRegs | (2 << 16)) : (Pm4Nop | (2 << 16)); + dispatch->offsUser3 = UsrRegOffset + usrRegCnt; + usrRegCnt += (flags & KernelArgEna) ? 2 : 0; + dispatch->argsLo = (uint)aqlPkt->kernel_arg_address; + dispatch->argsHi = (uint)(aqlPkt->kernel_arg_address >> 32); + + // Provide pointer to the private/scratch buffer for the flat address + if (flags & FlatScratchEna) { + dispatch->copyData = Pm4CopyReg; + dispatch->scratchAddrLo = (uint)((scratch - scratchOffset) >> 16); + dispatch->offsUser4 = UsrRegOffset + usrRegCnt; + dispatch->scratchOffs = scratchOffset; + dispatch->privSize = privateSize; + } + else { + dispatch->copyData = Pm4Nop | (8 << 16); + } + + // Update the global launch grid + dispatch->glbSizeX = aqlPkt->grid_size[0]; + dispatch->glbSizeY = aqlPkt->grid_size[1]; + dispatch->glbSizeZ = aqlPkt->grid_size[2]; + } + \n + __kernel void scheduler( + __global void * queue, + __global void * params, + uint paramIdx) + { + __amd_scheduler(queue, params, paramIdx); + } + \n + \x23 endif + \n + ); + +enum { + BlitCopyImage = 0, + BlitCopyImage1DA, + BlitCopyImageToBuffer, + BlitCopyBufferToImage, + BlitCopyBufferRect, + BlitCopyBufferRectAligned, + BlitCopyBuffer, + BlitCopyBufferAligned, + FillBuffer, + FillImage, + Scheduler, + BlitTotal +}; + +static const char* BlitName[BlitTotal] = { + "copyImage", "copyImage1DA", "copyImageToBuffer", + "copyBufferToImage", "copyBufferRect", "copyBufferRectAligned", + "copyBuffer", "copyBufferAligned", "fillBuffer", + "fillImage", "scheduler", +}; + +OCLBlitKernel::OCLBlitKernel() { _numSubTests = 1; } + +OCLBlitKernel::~OCLBlitKernel() {} + +void OCLBlitKernel::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + char dbuffer[1024] = {0}; + CPerfCounter timer; + int sub = 0; + std::string options = "-cl-std=CL2.0 -DOCL20=1"; + + cl_device_type deviceType; + error_ = 
_wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + testDescString = "GPU device is required for this test!\n"; + return; + } + + size_t param_size = 0; + char* strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[7] < '2') { + options = "-DOCL20=0"; + sub = 1; + delete strVersion; + testDescString = "Currently it works for OCL20 devices only!\n"; + return; + } + delete strVersion; + + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + std::string sch = strKernel; + static const char AmdScheduler[] = "amd_scheduler"; + static const char AmdSchedulerPal[] = "amd_scheduler_pal"; + static const char AmdSchedulerROCm[] = "amd_scheduler_rocm"; + const char* AmdSchedulerPatch = NULL; + size_t loc = 0; + + if (NULL != strstr(strVersion, "LC")) { + if (NULL != strstr(strVersion, "PAL")) { + AmdSchedulerPatch = AmdSchedulerPal; + } else if (NULL != strstr(strVersion, "HSA")) { + AmdSchedulerPatch = AmdSchedulerROCm; + } + } + delete strVersion; + + if (NULL != AmdSchedulerPatch) { + loc = sch.find(AmdScheduler); + sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPatch); + loc = sch.find(AmdScheduler, (loc + strlen(AmdSchedulerPatch))); + sch.replace(loc, 
strlen(AmdScheduler), AmdSchedulerPatch); + } + + timer.Reset(); + timer.Start(); + + const char* strProgram = sch.c_str(); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strProgram, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + options.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + cl_kernel kernels[BlitTotal]; + for (int i = 0; i < BlitTotal - sub; ++i) { + kernels[i] = _wrapper->clCreateKernel(program_, BlitName[i], &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + } + timer.Stop(); + double sec = timer.GetElapsedTime(); + + time_ = (float)sec * 1000.f; + testDescString = "Blit kernel compilaiton time (ms):"; + + for (int i = 0; i < BlitTotal - sub; ++i) { + _wrapper->clReleaseKernel(kernels[i]); + } +} + +void OCLBlitKernel::run(void) { _perfInfo = time_; } + +unsigned int OCLBlitKernel::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h new file mode 100644 index 0000000000..4f2d90957d --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBlitKernel.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_BLIT_KERNEL_H_
+#define _OCL_BLIT_KERNEL_H_
+
+#include "OCLTestImp.h"
+
+// Performance test that measures how long it takes to create and build the
+// blit-kernel program and to create its kernels. The measurement itself is
+// taken in open(); run() only publishes it.
+class OCLBlitKernel : public OCLTestImp {
+ public:
+  OCLBlitKernel();
+  virtual ~OCLBlitKernel();
+
+ public:
+  // Builds the program for the selected device and stores the elapsed time
+  // in time_.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Copies time_ into _perfInfo.
+  virtual void run(void);
+  // Forwards to OCLTestImp::close().
+  virtual unsigned int close(void);
+
+ private:
+  float time_;  // Build time in milliseconds (seconds * 1000), set by open().
+};
+
+#endif  // _OCL_BLIT_KERNEL_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp
new file mode 100644
index 0000000000..5278fe3998
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.cpp
@@ -0,0 +1,289 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLBufferFromImage.h" + +#include +#include +#include +#include + +#define GROUP_SIZE 256 + +const static char strKernel[] = + "__kernel void buffer2bufferCopy( " + " \n" + " __global char* input, " + " \n" + " __global char* output) " + " \n" + "{ " + " \n" + " int coord = (int)(get_global_id(0)); " + " \n" + " output[coord] = input[coord]; " + " \n" + "} " + " \n"; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateBufferFromImageAMD_fn)( + cl_context context, cl_mem image, cl_int *errcode_ret); +clCreateBufferFromImageAMD_fn clCreateBufferFromImageAMD; + +OCLBufferFromImage::OCLBufferFromImage() : OCLTestImp() { + _numSubTests = 2; + blockSizeX = GROUP_SIZE; + blockSizeY = 1; +} + +OCLBufferFromImage::~OCLBufferFromImage() {} + +void OCLBufferFromImage::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + buffer = bufferImage = clImage2D = bufferOut = NULL; + done = false; + pitchAlignment = 0; + bufferSize = 0; + + _openTest = test; + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLTestImp::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + testDescString = "GPU device is required for this test!\n"; + done = true; + return; + } + + clCreateBufferFromImageAMD = + (clCreateBufferFromImageAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clCreateBufferFromImageAMD"); + if (clCreateBufferFromImageAMD == NULL) { + testDescString = "clCreateBufferFromImageAMD not found!\n"; + done = true; + return; + } + + CompileKernel(); + AllocateOpenCLBuffer(); +} + +void OCLBufferFromImage::run(void) { + if (_errorFlag || done) { + return; + } + + if ((_openTest % 2) == 0) { + testReadBuffer(bufferImage); + } else { + 
testKernel(); + } +} + +void OCLBufferFromImage::AllocateOpenCLBuffer() { + cl_int status = 0; + + size_t size = 0; + pitchAlignment = 0; + status = _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_IMAGE_PITCH_ALIGNMENT, + sizeof(cl_uint), &pitchAlignment, &size); + pitchAlignment--; + + const unsigned int requiredPitch = + ((imageWidth + pitchAlignment) & ~pitchAlignment); + const unsigned int pitch = requiredPitch; + bufferSize = pitch * imageHeight; + + unsigned char *sourceData = new unsigned char[bufferSize]; + + // init data + for (unsigned int y = 0; y < bufferSize; y++) { + *(sourceData + y) = y; + } + buffer = _wrapper->clCreateBuffer(context_, + CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, + bufferSize, sourceData, &status); + + delete[] sourceData; + + const cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8}; +#if defined(CL_VERSION_2_0) + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth / 4, + imageHeight, + 0, + 0, + pitch, + 0, + 0, + 0, + {buffer}}; +#else + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth / 4, + imageHeight, + 0, + 0, + pitch, + 0, + 0, + 0, + buffer}; +#endif + clImage2D = _wrapper->clCreateImage(context_, CL_MEM_READ_WRITE, &format, + &desc, NULL, &status); + CHECK_RESULT(clImage2D == NULL || status != CL_SUCCESS, + "AllocateOpenCLImage() failed"); + + bufferImage = clCreateBufferFromImageAMD(context_, clImage2D, &status); + char c[1024]; + _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, sizeof(c), + &c, NULL); + if (status == CL_INVALID_OPERATION) { + testDescString = + "clCreateBufferFromImageAMD not supported on this device!\n"; + done = true; + return; + } + CHECK_RESULT(bufferImage == NULL || status != CL_SUCCESS, + "clCreateBufferFromImage(bufferOut) failed"); + + bufferOut = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufferSize, + NULL, &status); + CHECK_RESULT(bufferOut == NULL || status != CL_SUCCESS, + "clCreateBuffer(bufferOut) failed"); +} + +void 
OCLBufferFromImage::testReadBuffer(cl_mem buffer) { + cl_int status = 0; + unsigned char *dstData = new unsigned char[bufferSize]; + + status = clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, 1, 0, bufferSize, + dstData, 0, 0, 0); + + ::clFinish(cmdQueues_[_deviceId]); + + for (unsigned int y = 0; y < bufferSize; y++) { + if (*(dstData + y) != (unsigned char)y) { + CHECK_RESULT_NO_RETURN(true, "CheckCLBuffer: *(dstData+y)!=y => %i != %i", + *(dstData + y), y); + goto cleanup; + } + } +cleanup: + + delete[] dstData; +} + +void OCLBufferFromImage::testKernel() { + CopyOpenCLBuffer(bufferImage); + + testReadBuffer(bufferOut); +} + +unsigned int OCLBufferFromImage::close(void) { + if (bufferImage != NULL) clReleaseMemObject(bufferImage); + if (clImage2D != NULL) clReleaseMemObject(clImage2D); + if (buffer != NULL) clReleaseMemObject(buffer); + if (bufferOut != NULL) clReleaseMemObject(bufferOut); + return OCLTestImp::close(); +} + +void OCLBufferFromImage::CopyOpenCLBuffer(cl_mem buffer) { + cl_int status = 0; + + // Set appropriate arguments to the kernel2D + + // input buffer image + status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLBuffer() failed at " + "clSetKernelArg(kernel_,0,sizeof(cl_mem),&buffer)"); + status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &bufferOut); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLBuffer() failed at " + "clSetKernelArg(kernel_,1,sizeof(cl_mem),&bufferOut)"); + + // Enqueue a kernel run call. 
+ size_t global_work_offset[] = {0}; + size_t globalThreads[] = {bufferSize}; + size_t localThreads[] = {blockSizeX}; + + status = clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + globalThreads, NULL, 0, NULL, 0); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLBuffer() failed at clEnqueueNDRangeKernel"); + + status = clFinish(cmdQueues_[_deviceId]); + CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLBuffer() failed at clFinish"); +} + +void OCLBufferFromImage::CompileKernel() { + cl_int status = 0; + + size_t kernelSize = sizeof(strKernel); + const char *strs = (const char *)&strKernel[0]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs, + &kernelSize, &status); + + status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (status != CL_SUCCESS) { + if (status == CL_BUILD_PROGRAM_FAILURE) { + cl_int logStatus; + size_t buildLogSize = 0; + logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, buildLogSize, + NULL, &buildLogSize); + std::string buildLog; + buildLog.resize(buildLogSize); + + logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, buildLogSize, + &buildLog[0], NULL); + printf("%s", buildLog.c_str()); + } + return; + } + // get a kernel object handle for a kernel with the given name + kernel_ = _wrapper->clCreateKernel(program_, "buffer2bufferCopy", &status); + + size_t kernel2DWorkGroupSize = 0; + status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId], + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), + &kernel2DWorkGroupSize, 0); + + if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize) { + if (blockSizeX > kernel2DWorkGroupSize) { + blockSizeX = kernel2DWorkGroupSize; + blockSizeY = 1; + } + } +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h new file mode 100644 index 0000000000..aeab03b617 --- 
/dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLBufferFromImage.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCLBufferFromImage_H_
+#define _OCLBufferFromImage_H_
+
+#include "OCLTestImp.h"
+
+// Test for the clCreateBufferFromImageAMD extension: a 2D image is layered on
+// a linear buffer, a buffer view of that image is requested, and the view is
+// validated either by a direct read-back or by a copy kernel.
+class OCLBufferFromImage : public OCLTestImp {
+ public:
+  OCLBufferFromImage();
+  virtual ~OCLBufferFromImage();
+
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ protected:
+  // Row width and row count used to size the backing buffer. The image itself
+  // is created imageWidth / 4 pixels wide.
+  static const unsigned int imageWidth = 1920;
+  static const unsigned int imageHeight = 1080;
+
+  // Reads `buffer` back and checks the byte pattern written at allocation.
+  void testReadBuffer(cl_mem buffer);
+  // Copies bufferImage to bufferOut with the kernel, then checks bufferOut.
+  void testKernel();
+  // Creates buffer, clImage2D, bufferImage and bufferOut.
+  void AllocateOpenCLBuffer();
+  // Runs the copy kernel from `buffer` into bufferOut.
+  void CopyOpenCLBuffer(cl_mem buffer);
+  // Builds the program and creates the copy kernel.
+  void CompileKernel();
+
+  bool done;         // Set when the test is skipped; run() then does nothing.
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  size_t bufferSize;       // Size in bytes of the backing buffer.
+  cl_mem buffer;           // Backing buffer holding the source pattern.
+  cl_mem clImage2D;        // 2D image created on top of `buffer`.
+  cl_mem bufferImage;      // Result of clCreateBufferFromImageAMD.
+  cl_mem bufferOut;        // Destination of the copy kernel.
+  cl_uint pitchAlignment;  // Device pitch alignment, stored as alignment - 1.
+};
+
+#endif  // _OCLBufferFromImage_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp
new file mode 100644
index 0000000000..e4fa6968da
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.cpp
@@ -0,0 +1,178 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLCPUGuardPages.h"
+
+#include
+#include
+#include
+
+#include "CL/cl.h"
+#ifdef _WIN32
+#include
+#include  // for EXCEPTION_ACCESS_VIOLATION
+
+// Structured-exception filter: selects the handler for access violations and
+// lets every other exception code continue the handler search. A message is
+// printed in both cases.
+// NOTE(review): nothing in the visible part of this file references it.
+int filter(unsigned int code, struct _EXCEPTION_POINTERS* ep) {
+  printf("In filter\n");
+  if (code == EXCEPTION_ACCESS_VIOLATION) {
+    printf("caught AV as expected.");
+    return EXCEPTION_EXECUTE_HANDLER;
+  } else {
+    printf("didn't catch AV, unexpected.");
+    return EXCEPTION_CONTINUE_SEARCH;
+  };
+}
+
+#else
+#include
+
+#include
+#include
+#include
+
+// SIGSEGV handler: prints the faulting address and terminates the process
+// with exit status 0. Its installation in the constructor below is commented
+// out, so it is currently never registered by this file.
+void segfault_sigaction(int signal, siginfo_t *si, void *arg) {
+  printf("Caught segfault at address %p\n", si->si_addr);
+  exit(0);
+}
+
+#endif
+
+// Kernel under test: out[gid + out_offset] = -in[gid + in_offset].
+const static char* strKernel =
+    "__kernel void simple_in_out_test( int in_offset, \n"
+    " int out_offset, \n"
+    " __global float4* in, \n"
+    " __global float4* out) { \n"
+    "unsigned int gid = get_global_id(0);\n"
+    "out[gid + out_offset] = in[gid + in_offset] * -1.f;"
+    "}";
+
+// One entry per subtest, in the field order of testOCLCPUGuardPagesStruct:
+// {useGuardPages, shouldFail, items, in_offset, out_offset}.
+testOCLCPUGuardPagesStruct testOCLCPUGuardPagesList[] = {
+    {false, false, 1024, 0, 0}, {true, false, 1024, 0, 0},
+    {false, false, 1024, 0, 0}, {true, true, 1024, 0, 0},
+    {false, false, 1024, 0, 0}, {true, true, 1024, 0, 0}};
+
+// The number of subtests equals the number of table entries above.
+OCLCPUGuardPages::OCLCPUGuardPages() {
+  _numSubTests =
+      sizeof(testOCLCPUGuardPagesList) / sizeof(testOCLCPUGuardPagesStruct);
+
+  /*
+  struct sigaction sa;
+
+  memset(&sa, 0, sizeof(sa));
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = segfault_sigaction;
+  sa.sa_flags = SA_SIGINFO;
+
+  sigaction(SIGSEGV, &sa, NULL);
+  */
+}
+
+OCLCPUGuardPages::~OCLCPUGuardPages() {} + +void OCLCPUGuardPages::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + // Initialize the current test parameters. + testValues = testOCLCPUGuardPagesList[test]; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "simple_in_out_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + // Create input and output buffers for the test. 
+ cl_mem inBuffer, outBuffer; + cl_float4* dummyIn = new cl_float4[testValues.items]; + for (int i = 0; i < testValues.items; i++) { + dummyIn[i].s[0] = dummyIn[i].s[1] = dummyIn[i].s[2] = dummyIn[i].s[3] = + i * 1.f; + } + inBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + testValues.items * sizeof(cl_float4), + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], inBuffer, 1, 0, + testValues.items * sizeof(cl_float4), + dummyIn, 0, 0, 0); + buffers_.push_back(inBuffer); + + outBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + testValues.items * sizeof(cl_float4), + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(outBuffer); + delete[] dummyIn; +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLCPUGuardPages::run(void) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_int), + &testValues.in_offset); + error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_int), + &testValues.out_offset); + error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), &buffers()[0]); + error_ |= _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), &buffers()[1]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t globalThreads[1]; + globalThreads[0] = testValues.items; + size_t localThreads[1] = {256}; + +#ifdef _WIN32 + // LPTOP_LEVEL_EXCEPTION_FILTER pOriginalFilter = + // SetUnhandledExceptionFilter(MyUnhandledExceptionFilter); + // AddVectoredExceptionHandler(1,MyVectorExceptionFilter); + + try { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, globalThreads, localThreads, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } catch (...) 
{ + printf("exception caught in OCLTest...\n"); + } + +#else + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, globalThreads, localThreads, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); +#endif +} + +unsigned int OCLCPUGuardPages::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h new file mode 100644 index 0000000000..a90451c4b7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCPUGuardPages.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_CPU_GUARD_PAGES_H_
+#define _OCL_CPU_GUARD_PAGES_H_
+
+#include "OCLTestImp.h"
+
+// Parameters of one guard-page subtest.
+typedef struct {
+  // NOTE(review): useGuardPages and shouldFail are set in the subtest table
+  // but are not read by the open()/run() code visible in the .cpp - confirm
+  // whether they are still needed.
+  bool useGuardPages;
+  bool shouldFail;
+  int items;       // Number of float4 elements per buffer and of work-items.
+  int in_offset;   // Element offset added to the input index by the kernel.
+  int out_offset;  // Element offset added to the output index by the kernel.
+} testOCLCPUGuardPagesStruct;
+
+// Test that runs a simple in/out kernel with the offsets of the selected
+// subtest.
+class OCLCPUGuardPages : public OCLTestImp {
+ public:
+  OCLCPUGuardPages();
+  virtual ~OCLCPUGuardPages();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  testOCLCPUGuardPagesStruct testValues;  // Parameters of the open subtest.
+};
+
+#endif  // _OCL_CPU_GUARD_PAGES_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp
new file mode 100644
index 0000000000..4c40ace60a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLCreateBuffer.h" + +#include +#include +#include +#include + +#include +#ifdef ATI_OS_LINUX +#include +#endif + +#include "CL/cl.h" + +const static size_t MaxSubTests = 1; + +OCLCreateBuffer::OCLCreateBuffer() { + _numSubTests = MaxSubTests; + failed_ = false; + maxSize_ = 0; +} + +OCLCreateBuffer::~OCLCreateBuffer() {} + +void OCLCreateBuffer::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + + size_t size; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(cl_ulong), &maxSize_, &size); +//! Workaround out of range issue in Windows 32bit apps +#if defined(_WIN32) && !defined(_WIN64) + static const size_t MaxSizeLimit = 512 * 1024 * 1024; + if (maxSize_ > MaxSizeLimit) { + maxSize_ = MaxSizeLimit; + } +#endif + cl_mem buf = NULL; + buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, maxSize_, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + + buffers_.push_back(buf); +} + +void OCLCreateBuffer::run(void) { + CPerfCounter timer; + + cl_uchar pattern = PATTERN; + timer.Reset(); + timer.Start(); + error_ = /*_wrapper->*/ clEnqueueFillBuffer( + cmdQueues_[_deviceId], buffers_[0], &pattern, sizeof(pattern), 0, + maxSize_, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillBuffer() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + size_t maxSteps = maxSize_; +#ifdef ATI_OS_LINUX + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + if (maxSteps > (size_t)(pages * page_size / 2)) { + 
maxSteps = (size_t)pages * page_size / 2; + } +#endif + void *resultBuf = NULL; + ; + while ((resultBuf = malloc(maxSteps)) == NULL) { + maxSteps /= 2; + continue; + } + + checkResult(maxSteps, resultBuf, pattern); + + pattern += 1; + + memset(resultBuf, pattern, maxSteps); + + writeBuffer(maxSteps, resultBuf); + + memset(resultBuf, 0x00, maxSteps); + checkResult(maxSteps, resultBuf, pattern); + + free(resultBuf); + + timer.Stop(); + double sec = timer.GetElapsedTime(); + + _perfInfo = (float)sec * 1000.f; + std::stringstream str; + str << "Max single alloc (size of "; + str << maxSize_; + str << " bytes) "; + + testDescString = str.str(); + str << "Max single read/write (size of "; + str << maxSize_; + str << " bytes) create time (ms):"; + + testDescString = str.str(); +} + +void OCLCreateBuffer::checkResult(size_t maxSteps, void *resultBuf, + cl_uchar pattern) { + size_t startPoint = 0; + while ((startPoint) < maxSize_) { + cl_event ee; + size_t readSize = maxSteps; + if ((startPoint + maxSteps) > maxSize_) { + readSize = maxSize_ - startPoint; + } + error_ = /*wrapper->*/ clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffers_[0], CL_FALSE, startPoint, readSize, + resultBuf, 0, NULL, &ee); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + size_t cnt = 0; + cl_uchar *cc = (cl_uchar *)resultBuf; + for (size_t i = 0; i < readSize; i++) { + if (cc[i] != pattern) { + cnt++; + } + } + if (cnt != 0) { + error_ = -1; + CHECK_RESULT((error_ != CL_SUCCESS), "checkResult() failed"); + break; + } + startPoint += maxSteps; + } +} + +void OCLCreateBuffer::writeBuffer(size_t maxSteps, void *dataBuf) { + size_t startPoint = 0; + while ((startPoint) < maxSize_) { + cl_event ee; + size_t writeSize = maxSteps; + if ((startPoint + maxSteps) > maxSize_) { + writeSize = maxSize_ - startPoint; + } + error_ = /*wrapper->*/ clEnqueueWriteBuffer( + cmdQueues_[_deviceId], buffers_[0], CL_FALSE, startPoint, writeSize, + 
dataBuf, 0, NULL, &ee); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + startPoint += maxSteps; + } +} + +unsigned int OCLCreateBuffer::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h new file mode 100644 index 0000000000..7797563753 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_CREATE_BUFFER_H_
+#define _OCL_CREATE_BUFFER_H_
+
+#include "OCLTestImp.h"
+// Byte value used for the initial device-side fill of the buffer.
+#define PATTERN 0x20
+
+// Test that creates one buffer of the device's maximum allocation size and
+// verifies that it can be filled, read and written in full.
+class OCLCreateBuffer : public OCLTestImp {
+ public:
+  OCLCreateBuffer();
+  virtual ~OCLCreateBuffer();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  // Writes dataBuf over the whole buffer in chunks of tmpMaxSize bytes.
+  virtual void writeBuffer(size_t tmpMaxSize, void* dataBuf);
+  // Reads the buffer back in chunks of tmpMaxSize bytes into resultBuf and
+  // checks every byte against pattern.
+  virtual void checkResult(size_t tmpMaxSize, void* resultBuf,
+                           cl_uchar pattern);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;          // Initialised to false by the constructor.
+  unsigned int testID_;  // Subtest id passed to open().
+  cl_ulong maxSize_;     // Buffer size in bytes (max single allocation).
+};
+
+#endif  // _OCL_CREATE_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp
new file mode 100644
index 0000000000..3853eeacf7
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLCreateContext.h" + +#include +#include +#include + +#include "CL/cl.h" + +OCLCreateContext::OCLCreateContext() { _numSubTests = 1; } + +OCLCreateContext::~OCLCreateContext() {} + +void OCLCreateContext::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLCreateContext::run(void) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + + int error = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error != CL_SUCCESS, "clGetPlatformIDs failed"); + for (unsigned i = 0; i < numPlatforms; ++i) { + char pbuf[100]; + error = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, + sizeof(pbuf), pbuf, NULL); + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + platform = platforms[i]; + break; + } + } + delete platforms; + } + + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); + + /* Get the number of requested devices */ + error = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, + &num_devices); + CHECK_RESULT(error != CL_SUCCESS, "clGetDeviceIDs failed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, + devices, NULL); + CHECK_RESULT(error != CL_SUCCESS, "clGetDeviceIDs failed"); + + device = devices[0]; + + cl_context gContext = _wrapper->clCreateContext( + NULL, 1, &device, notify_callback, NULL, &error); + CHECK_RESULT(gContext == 0, "clCreateContext failed"); + + error = _wrapper->clReleaseContext(gContext); + CHECK_RESULT(error != CL_SUCCESS, "clReleaseContext failed"); +} + +unsigned int OCLCreateContext::close(void) { return _crcword; } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h new file mode 100644 index 0000000000..bcff21868a --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateContext.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_CreateContext_H_ +#define _OCL_CreateContext_H_ + +#include "OCLTestImp.h" + +class OCLCreateContext : public OCLTestImp { + public: + OCLCreateContext(); + virtual ~OCLCreateContext(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); +}; + +#endif // _OCL_CreateContext_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp new file mode 100644 index 0000000000..d6e385eaa4 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.cpp @@ -0,0 +1,493 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLCreateImage.h" + +#include +#include +#include + +#include +#ifdef ATI_OS_LINUX +#include +#include +#endif + +#include "CL/cl.h" + +const static size_t ImageSize = 4; +const static size_t MaxSubTests = 5; + +const static char *strKernel = + "const sampler_t g_Sampler = CLK_FILTER_LINEAR | \n" + " CLK_ADDRESS_CLAMP_TO_EDGE | \n" + " CLK_NORMALIZED_COORDS_FALSE; \n" + " \n" + "__kernel void linear3D(__read_only image3d_t img3D, __global float4* " + "f4Tata) \n" + "{ \n" + " float4 f4Index = { 2.25f, 1.75f, 0.5f, 0.0f }; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img3D, g_Sampler, f4Index); \n" + "} \n" + " \n" + "__kernel void linear2D(__read_only image2d_t img2D, __global float4* " + "f4Tata) \n" + "{ \n" + " float2 f2Index = { 2.25f, 1.75f }; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img2D, g_Sampler, f2Index); \n" + "} \n" + " \n" + "__kernel void linear1DArray(__read_only image1d_array_t img1DA, __global " + "float4* f4Tata) \n" + "{ \n" + " float2 f2Index = { 2.25f, 0 }; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img1DA, g_Sampler, f2Index); \n" + "} \n" + " \n" + "__kernel void linear2DArray(__read_only image2d_array_t img2DA, __global " + "float4* f4Tata) \n" + "{ \n" + " float4 f4Index = { 2.25f, 1.75f, 0.0f, 0.0f }; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img2DA, g_Sampler, f4Index); \n" + "} \n" + " \n" + 
"__kernel void point1DBuffer(__read_only image1d_buffer_t img1DB, __global " + "float4* f4Tata) \n" + "{ \n" + " int index = 2; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img1DB, index); \n" + "} \n" + " \n"; + +OCLCreateImage::OCLCreateImage() { + _numSubTests = MaxSubTests; + failed_ = false; + ImageSizeX = ImageSize; + ImageSizeY = ImageSize; + ImageSizeZ = ImageSize; +} + +OCLCreateImage::~OCLCreateImage() {} + +void OCLCreateImage::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + + cl_bool imageSupport; + size_t size; + for (size_t i = 0; i < deviceCount_; ++i) { + _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT, + sizeof(imageSupport), &imageSupport, &size); + if (!imageSupport) { + failed_ = true; + return; + } + } + + cl_ulong max2DWidth; + cl_ulong max2DHeight; + + cl_ulong max3DWidth; + cl_ulong max3DHeight; + cl_ulong max3DDepth; + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(cl_ulong), &maxSize_, &size); + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(cl_ulong), &max2DWidth, &size); + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(cl_ulong), &max2DHeight, &size); + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE3D_MAX_WIDTH, + sizeof(cl_ulong), &max3DWidth, &size); + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE3D_MAX_HEIGHT, + sizeof(cl_ulong), &max3DHeight, &size); + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_IMAGE3D_MAX_DEPTH, + sizeof(cl_ulong), &max3DDepth, &size); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = 
_wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + const char *kernels[] = {"linear3D", "linear2D", "linear2DArray", + "linear1DArray", "point1DBuffer"}; + unsigned int dimensions[] = {3, 2, 3, 2, 1}; + kernel_ = _wrapper->clCreateKernel(program_, kernels[test], &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem memory; + cl_mem buf = NULL; + cl_image_desc desc; + size_t offset[3] = {0, 0, 0}; + cl_image_format imageFormat = {CL_RGBA, CL_FLOAT}; + + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_array_size = 0; + desc.image_row_pitch = 0; + desc.image_slice_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = (cl_mem)NULL; + + if (test == 0) { + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + if (is64BitApp()) { + ImageSizeX = max3DWidth; + ImageSizeY = maxSize_ / (ImageSizeX * 16); + if (ImageSizeY > (max3DHeight)) { + ImageSizeY = max3DHeight; + } + ImageSizeZ = maxSize_ / (ImageSizeX * ImageSizeY * 16); + } else { + ImageSizeX = 4; + ImageSizeY = 4; + ImageSizeZ = 4; + } + desc.image_width = ImageSizeX; + desc.image_height = ImageSizeY; + desc.image_depth = ImageSizeZ; + } + if (test == 1) { + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + if (is64BitApp()) { + ImageSizeX = max2DWidth - 0x10; + ImageSizeY = maxSize_ / (ImageSizeX * 16 * 2); + if (ImageSizeY >= max2DHeight) { + ImageSizeY = max2DHeight - 0x1000; + } +#ifdef ATI_OS_LINUX + // On linux, if the size of total system memory is less than 4GB, + // then, we can allocate much smaller image. 
+ // TODO, need to find the root cause + struct sysinfo myinfo; + unsigned long total_bytes; + + sysinfo(&myinfo); + total_bytes = myinfo.mem_unit * myinfo.totalram; + if ((total_bytes / (1024 * 1024)) <= 4096) { + ImageSizeY /= 2; + } +#endif + } else { + ImageSizeX = 4; + ImageSizeY = 4; + } + ImageSizeZ = 0; + desc.image_width = ImageSizeX; + desc.image_height = ImageSizeY; + desc.image_depth = 0; + } else if (test == 2) { + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + ImageSizeX = ImageSize; + ImageSizeY = ImageSize; + ImageSizeZ = ImageSize; + desc.image_width = ImageSizeX; + desc.image_height = ImageSizeY; + desc.image_depth = 0; + desc.image_array_size = ImageSize; + } else if (test == 3) { + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + ImageSizeX = ImageSize; + ImageSizeY = ImageSize; + ImageSizeZ = 0; + desc.image_width = ImageSize; + desc.image_height = ImageSize; + desc.image_depth = 0; + desc.image_array_size = ImageSize; + } else if (test == 4) { + ImageSizeX = ImageSize; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + ImageSizeX * 4 * sizeof(cl_float), NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + ImageSizeY = 0; + ImageSizeZ = 0; + desc.image_width = ImageSizeX; + desc.image_height = 0; + desc.image_depth = 0; + desc.buffer = buf; + } + + memory = _wrapper->clCreateImage(context_, CL_MEM_READ_ONLY, &imageFormat, + &desc, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed"); + + float fillColor[4] = {1.f, 1.f, 1.f, 1.f}; + + if (dimensions[test] == 1) { + float data[4][ImageSize]; + size_t region[3] = {ImageSize, 1, 1}; + + error_ = + _wrapper->clEnqueueFillImage(cmdQueues_[_deviceId], memory, fillColor, + offset, region, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed"); + error_ = + _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], memory, true, + offset, region, 0, 0, 
data, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed"); + + for (size_t x = 0; x < ImageSize; ++x) { + if (0 != memcmp(&data[x], fillColor, sizeof(fillColor))) { + CHECK_RESULT(true, "Fill image validation failed"); + } + data[x][0] = (float)x; + data[x][1] = data[x][2] = data[x][3] = 1.0f; + } + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, true, + offset, region, 0, 0, data, 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed"); + } else if (dimensions[test] == 2) { + size_t region[3] = {ImageSizeX, ImageSizeY, 1}; + + error_ = + _wrapper->clEnqueueFillImage(cmdQueues_[_deviceId], memory, fillColor, + offset, region, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed"); + + float *data; + size_t ActualImageSizeY = ImageSizeY; + size_t maxImageSize = maxSize_; +#ifdef ATI_OS_LINUX + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + if (maxImageSize > ((size_t)pages * page_size)) { + maxImageSize = ((size_t)pages * page_size); + } +#endif + while ((((ImageSizeX * ActualImageSizeY * sizeof(float) * 4) / + (1024 * 1024)) >= (size_t)4 * 1024) || + ((ImageSizeX * ActualImageSizeY * sizeof(float) * 4) >= + (maxImageSize / 2))) { + if (ActualImageSizeY == 1) { + break; + } + ActualImageSizeY /= 2; + } + while ((data = (float *)malloc(ImageSizeX * ActualImageSizeY * + sizeof(float) * 4)) == NULL) { + if (ActualImageSizeY == 1) { + break; + } + ActualImageSizeY /= 2; + } + if (data == NULL) { + CHECK_RESULT(true, "malloc() failed"); + } + + size_t remainSizeY = ImageSizeY; + while (remainSizeY > 0) { + ActualImageSizeY = + (remainSizeY > ActualImageSizeY) ? 
ActualImageSizeY : remainSizeY; + size_t tmpRange[3] = {ImageSizeX, ActualImageSizeY, 1}; + error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], memory, true, + offset, tmpRange, 0, 0, data, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed"); + + for (size_t y = 0; y < ActualImageSizeY; ++y) { + for (size_t x = 0; x < ImageSizeX; ++x) { + size_t offsetData = (y * ImageSizeX + x) * 4; + if (0 != memcmp(&data[offsetData], fillColor, sizeof(fillColor))) { + CHECK_RESULT(true, "Fill image validation failed"); + } + data[offsetData + 0] = (float)x; + data[offsetData + 1] = (float)y; + data[offsetData + 2] = data[offsetData + 3] = 1.0f; + } + } + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, + true, offset, tmpRange, 0, 0, data, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed"); + remainSizeY -= ActualImageSizeY; + offset[1] += ActualImageSizeY; + } + free(data); + } else if (dimensions[test] == 3) { + float *data; + + float index = 0.f; + size_t region[3] = {ImageSizeX, ImageSizeY, ImageSizeZ}; + error_ = + _wrapper->clEnqueueFillImage(cmdQueues_[_deviceId], memory, fillColor, + offset, region, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillImage() failed"); + + size_t ActualImageSizeZ = ImageSizeZ; + size_t maxImageSize = maxSize_; +#ifdef ATI_OS_LINUX + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + if (maxImageSize > ((size_t)pages * page_size)) { + maxImageSize = ((size_t)pages * page_size); + } +#endif + while ((((ImageSizeX * ImageSizeY * ActualImageSizeZ * sizeof(float) * 4) / + (1024 * 1024)) >= (size_t)4 * 1024) || + ((ImageSizeX * ImageSizeY * ActualImageSizeZ * sizeof(float) * 4) >= + (maxImageSize / 2))) { + if (ActualImageSizeZ == 1) { + break; + } + ActualImageSizeZ /= 2; + } + while ((data = (float *)malloc(ImageSizeX * ImageSizeY * ActualImageSizeZ * + sizeof(float) * 4)) == NULL) { + if 
(ActualImageSizeZ == 1) { + break; + } + ActualImageSizeZ -= 1; + } + if (data == NULL) { + CHECK_RESULT(true, "malloc() failed"); + } + + size_t remainSizeZ = ImageSizeZ; + while (remainSizeZ > 0) { + ActualImageSizeZ = + (remainSizeZ > ActualImageSizeZ) ? ActualImageSizeZ : remainSizeZ; + size_t tmpRange[3] = {ImageSizeX, ImageSizeY, ActualImageSizeZ}; + error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], memory, true, + offset, tmpRange, 0, 0, data, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed"); + + for (size_t z = 0; z < ActualImageSizeZ; ++z) { + for (size_t y = 0; y < ImageSizeY; ++y) { + for (size_t x = 0; x < ImageSizeX; ++x) { + size_t offset = (((z * ImageSizeY) + y) * ImageSizeX + x) * 4; + if (0 != memcmp(&data[offset], fillColor, sizeof(fillColor))) { + CHECK_RESULT(true, "Fill image validation failed"); + } + data[offset + 0] = (float)x; + data[offset + 1] = (float)y; + data[offset + 2] = (float)z; + data[offset + 3] = 1.0f; + } + } + } + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, + true, offset, tmpRange, 0, 0, data, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed"); + remainSizeZ -= ActualImageSizeZ; + offset[2] += ActualImageSizeZ; + } + free(data); + } + + buffers_.push_back(memory); + + memory = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + 4 * sizeof(cl_float), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(memory); + if (buf != NULL) { + buffers_.push_back(buf); + } + size_t imageSizebyte = + (ImageSizeY != 0) ? ImageSizeY * ImageSizeX : ImageSizeX; + imageSizebyte *= (ImageSizeZ != 0) ? 
ImageSizeZ : 1; + imageSizebyte *= 16; // 16 bytes per pixel, imageFormat = {CL_RGBA,CL_FLOAT} + char strImgSize[200]; + if (imageSizebyte >= 1024 * 1024) { + sprintf(strImgSize, "%5ld MB", (long)(imageSizebyte / (1024 * 1024))); + } else { + sprintf(strImgSize, "%6ld Bytes", (long)imageSizebyte); + } + std::stringstream str; + str << " ("; + str << ImageSizeX; + str << ", "; + str << ImageSizeY; + str << ", "; + str << ImageSizeZ; + str << ") "; + str << strImgSize; + + testDescString = str.str(); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLCreateImage::run(void) { + if (failed_) { + return; + } + + cl_float values[4] = {0.f, 0.f, 0.f, 0.f}; + cl_float ref[2] = {1.75f, 1.25f}; + cl_mem image = buffers()[0]; + cl_mem buffer = buffers()[1]; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &image); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {0x1}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0, + 4 * sizeof(cl_float), values, 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + if (testID_ == 4) { + ref[0] = 2.0f; + } + for (cl_uint i = 0; i < static_cast((testID_ >= 3) ? 
1 : 2); ++i) { + if (values[i] != ref[i]) { + printf("%.2f != %.2f [ref]", values[i], ref[i]); + CHECK_RESULT(true, " - Incorrect result for linear filtering!\n"); + } + } +} + +unsigned int OCLCreateImage::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h new file mode 100644 index 0000000000..95347a5f6c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLCreateImage.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_CREATE_IMAGE_H_ +#define _OCL_CREATE_IMAGE_H_ + +#include "OCLTestImp.h" + +class OCLCreateImage : public OCLTestImp { + public: + OCLCreateImage(); + virtual ~OCLCreateImage(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int testID_; + size_t maxSize_; + size_t ImageSizeX; + size_t ImageSizeY; + size_t ImageSizeZ; + + bool is64BitApp() { return sizeof(int*) == 8; } +}; + +#endif // _OCL_CREATE_IMAGE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp new file mode 100644 index 0000000000..7d5a94aedb --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.cpp @@ -0,0 +1,210 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLDeviceAtomic.h" + +#include +#include +#include + +#include "CL/cl.h" + +static const cl_uint TotalElements = 256 * 1024 * 1024; +static const cl_uint ArraySize = 256; +static cl_uint hostArray[ArraySize]; + +#define KERNEL_CODE(...) #__VA_ARGS__ + +const static char* strKernel[] = { + KERNEL_CODE( + \n __kernel void atomic_test1(__global uint* res) { + __global atomic_uint* inc = (__global atomic_uint*)res; + atomic_fetch_add_explicit(inc, 1, memory_order_acq_rel, + memory_scope_device); + } + \n __kernel void atomic_test2(__global uint* res) { + __global atomic_uint* inc = (__global atomic_uint*)res; + atomic_fetch_add_explicit(inc, 1, memory_order_acq_rel, + memory_scope_device); + } + \n), + KERNEL_CODE( + \n __kernel void atomic_test1(__global uint* res) { + for (uint i = 0; i < 256 * 1024; ++i) { + for (uint j = 0; j < 256; ++j) { + __global atomic_uint* inc = (__global atomic_uint*)&res[j]; + uint val = atomic_load_explicit(inc, memory_order_acquire, + memory_scope_device); + if (0 != val) { + res[1] = get_global_id(0); + res[2] = i; + return; + } + } + } + } + \n __kernel void atomic_test2(__global uint* res) { + if (get_global_id(0) == 64 * 1000 * 1000) { + __global atomic_uint* inc = (__global atomic_uint*)res; + // atomic_fetch_add_explicit(inc, 1, memory_order_acq_rel, + // memory_scope_device); + atomic_store_explicit(inc, get_global_id(0), memory_order_release, + memory_scope_device); + } + } + \n)}; + +OCLDeviceAtomic::OCLDeviceAtomic() + : hostQueue_(NULL), failed_(false), kernel2_(NULL) { + _numSubTests = 2; +} + +OCLDeviceAtomic::~OCLDeviceAtomic() {} + +void OCLDeviceAtomic::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + 
OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + size_t param_size = 0; + char* strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[7] < '2') { + failed_ = true; + return; + } + delete strVersion; + + char dbuffer[1024] = {0}; + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel[test], + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "atomic_test1", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + kernel2_ = _wrapper->clCreateKernel(program_, "atomic_test2", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + memset(hostArray, 0, sizeof(hostArray)); + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_COPY_HOST_PTR, + sizeof(hostArray), &hostArray, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + +#if defined(CL_VERSION_2_0) + const cl_queue_properties cprops[] = {CL_QUEUE_PROPERTIES, + static_cast(0), 0}; + hostQueue_ = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[deviceId], cprops, 
&error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); +#endif +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLDeviceAtomic::run(void) { + if (failed_) return; + cl_mem buffer = buffers()[0]; + + size_t gws[1] = {TotalElements}; + size_t gws2[1] = {1}; + size_t gws3[1] = {TotalElements}; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + if (testID_ == 0) { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } else { + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws2, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + + error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + if (testID_ == 0) { + error_ = _wrapper->clEnqueueNDRangeKernel(hostQueue_, kernel2_, 1, NULL, + gws, NULL, 0, NULL, NULL); + } else { + error_ = _wrapper->clEnqueueNDRangeKernel(hostQueue_, kernel2_, 1, NULL, + gws3, NULL, 0, NULL, NULL); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFlush(cmdQueues_[_deviceId]); + _wrapper->clFlush(hostQueue_); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + _wrapper->clFinish(hostQueue_); + + error_ = _wrapper->clEnqueueReadBuffer(hostQueue_, buffer, CL_TRUE, 0, + sizeof(hostArray), hostArray, 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + if (testID_ == 0) { + if (hostArray[0] != 2 * TotalElements) { + printf("Counter: %d, expected: %d\n", hostArray[0], 2 * TotalElements); + CHECK_RESULT(true, "Incorrect result for device 
atomic inc!\n"); + } + } else { + printf("Value: %d, thread: %d, iter: %d\n", hostArray[0], hostArray[1], + hostArray[2]); + if (hostArray[0] == 0) { + CHECK_RESULT(true, "Incorrect result for device atomic inc!\n"); + } + } +} + +unsigned int OCLDeviceAtomic::close(void) { + if (NULL != hostQueue_) { + _wrapper->clReleaseCommandQueue(hostQueue_); + } + if (NULL != kernel2_) { + _wrapper->clReleaseKernel(kernel2_); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h new file mode 100644 index 0000000000..7bb69ef1a1 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceAtomic.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_DEVICE_ATOMIC_H_ +#define _OCL_DEVICE_ATOMIC_H_ + +#include "OCLTestImp.h" + +class OCLDeviceAtomic : public OCLTestImp { + public: + OCLDeviceAtomic(); + virtual ~OCLDeviceAtomic(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + cl_command_queue hostQueue_; + bool failed_; + cl_kernel kernel2_; + unsigned int testID_; +}; + +#endif // _OCL_DEVICE_ATOMIC_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp new file mode 100644 index 0000000000..b233cb41cb --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp @@ -0,0 +1,288 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLDeviceQueries.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +struct AMDDeviceInfo { + const char* targetName_; //!< Target name + const char* machineTarget_; //!< Machine target + cl_uint simdPerCU_; //!< Number of SIMDs per CU + cl_uint simdWidth_; //!< Number of workitems processed per SIMD + cl_uint simdInstructionWidth_; //!< Number of instructions processed per SIMD + cl_uint memChannelBankWidth_; //!< Memory channel bank width + cl_uint localMemSizePerCU_; //!< Local memory size per CU + cl_uint localMemBanks_; //!< Number of banks of local memory + cl_uint gfxipMajor_; //!< GFXIP major number + cl_uint gfxipMinor_; //!< GFXIP minor number +}; + +static const cl_uint Ki = 1024; +static const AMDDeviceInfo DeviceInfo[] = { + // targetName machineTarget + /* CAL_TARGET_600 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_610 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_630 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_670 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_7XX */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_770 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_710 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_730 */ {"", "", 0, 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_CYPRESS */ + {"Cypress", "cypress", 1, 16, 5, 256, 32 * Ki, 32, 4, 0}, + /* CAL_TARGET_JUNIPER */ + {"Juniper", "juniper", 1, 16, 5, 256, 32 * Ki, 32, 4, 0}, + /* CAL_TARGET_REDWOOD */ + {"Redwood", "redwood", 1, 16, 5, 256, 32 * Ki, 16, 4, 0}, + /* CAL_TARGET_CEDAR */ {"Cedar", "cedar", 1, 8, 5, 256, 32 * Ki, 16, 4, 0}, + /* CAL_TARGET_SUMO */ + {"WinterPark", "redwood", 1, 16, 5, 256, 32 * Ki, 16, 4, 0}, + /* CAL_TARGET_SUPERSUMO*/ + {"BeaverCreek", "redwood", 1, 16, 5, 256, 32 * Ki, 16, 4, 0}, + /* CAL_TARGET_WRESTLER*/ + {"Loveland", "cedar", 1, 8, 5, 256, 32 * Ki, 16, 4, 0}, + /* CAL_TARGET_CAYMAN */ + {"Cayman", "cayman", 1, 16, 4, 256, 32 * Ki, 32, 5, 0}, + /* CAL_TARGET_KAUAI */ {"", 
"", 1, 16, 5, 256, 32 * Ki, 32, 4, 0}, + /* CAL_TARGET_BARTS */ {"Barts", "barts", 1, 16, 5, 256, 32 * Ki, 32, 4, 0}, + /* CAL_TARGET_TURKS */ {"Turks", "turks", 1, 16, 5, 256, 32 * Ki, 32, 4, 0}, + /* CAL_TARGET_CAICOS */ + {"Caicos", "caicos", 1, 16, 5, 256, 32 * Ki, 32, 4, 0}, + /* CAL_TARGET_TAHITI */ + {"Tahiti", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 6, 0}, + /* CAL_TARGET_PITCAIRN */ + {"Pitcairn", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 6, 0}, + /* CAL_TARGET_CAPEVERDE */ + {"Capeverde", "capeverde", 4, 16, 1, 256, 64 * Ki, 32, 6, 0}, + /* CAL_TARGET_DEVASTATOR */ + {"Devastator", "trinity", 1, 16, 4, 256, 32 * Ki, 32, 5, 0}, + /* CAL_TARGET_SCRAPPER */ + {"Scrapper", "trinity", 1, 16, 4, 256, 32 * Ki, 32, 5, 0}, + /* CAL_TARGET_OLAND */ {"Oland", "oland", 4, 16, 1, 256, 64 * Ki, 32, 6, 0}, + /* CAL_TARGET_BONAIRE */ + {"Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 7, 2}, + /* CAL_TARGET_SPECTRE */ + {"Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 7, 1}, + /* CAL_TARGET_SPOOKY */ + {"Spooky", "spooky", 4, 16, 1, 256, 64 * Ki, 32, 7, 1}, + /* CAL_TARGET_KALINDI */ + {"Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 7, 2}, + /* CAL_TARGET_HAINAN */ + {"Hainan", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 6, 0}, + /* CAL_TARGET_HAWAII */ + {"Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 7, 2}, + /* CAL_TARGET_ICELAND */ + {"Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_TONGA */ {"Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_MULLINS */ + {"Mullins", "mullins", 4, 16, 1, 256, 64 * Ki, 32, 7, 2}, + /* CAL_TARGET_FIJI */ {"Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_CARRIZO */ + {"Carrizo", "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_CARRIZO */ + {"Bristol Ridge", "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_Ellesmere */ + {"Ellesmere", "ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_BAFFIN */ + {"Baffin", "baffin", 4, 16, 1, 256, 64 * Ki, 
32, 8, 0}, + /* ROCM Kaveri */ {"gfx700", "gfx700", 4, 16, 1, 256, 64 * Ki, 32, 7, 1}, + /* ROCM Hawaii */ {"gfx701", "gfx701", 4, 16, 1, 256, 64 * Ki, 32, 7, 2}, + /* ROCM Kabini */ {"gfx703", "gfx703", 4, 16, 1, 256, 64 * Ki, 32, 7, 2}, + /* ROCM Iceland */ {"gfx800", "gfx800", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* ROCM Carrizo */ {"gfx801", "gfx801", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* ROCM Tonga */ {"gfx802", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* ROCM Fiji */ {"gfx803", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* Vega10 */ {"gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* CAL_TARGET_STONEY */ + {"Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* CAL_TARGET_LEXA */ + {"gfx804", "gfx804", 4, 16, 1, 256, 64 * Ki, 32, 8, 0}, + /* Vega10_XNACK */ {"gfx901", "gfx901", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Raven */ {"gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* ROCM Raven_XNACK */ + {"gfx902-xnack", "gfx902-xnack", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Raven_XNACK */ {"gfx903", "gfx903", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Vega12 */ {"gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Vega12_XNACK */ {"gfx905", "gfx905", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Vega20 */ {"gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Vega20 */ + {"gfx906+sram-ecc", "gfx906+sram-ecc", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Vega20_XNACK */ {"gfx907", "gfx907", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* MI100 */ {"gfx908", "gfx908", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* MI100 */ + {"gfx908+sram-ecc", "gfx908+sram-ecc", 4, 16, 1, 256, 64 * Ki, 32, 9, 0}, + /* Navi10 */ {"gfx1010", "gfx1010", 4, 32, 1, 256, 64 * Ki, 32, 10, 1}, + /* Navi12 */ {"gfx1011", "gfx1011", 4, 32, 1, 256, 64 * Ki, 32, 10, 1}, + /* Navi14 */ {"gfx1012", "gfx1012", 4, 32, 1, 256, 64 * Ki, 32, 10, 1}, +}; + +const int DeviceInfoSize = sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); + +OCLDeviceQueries::OCLDeviceQueries() { + 
_numSubTests = 1; + failed_ = false; +} + +OCLDeviceQueries::~OCLDeviceQueries() {} + +void OCLDeviceQueries::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + char name[1024] = {0}; + size_t size = 0; + + if (deviceId >= deviceCount_) { + failed_ = true; + return; + } + cl_uint value; + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024, + name, &size); + if (!strstr(name, "cl_amd_device_attribute_query")) { + printf("AMD device attribute extension is required for this test!\n"); + failed_ = true; + return; + } + + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_NAME, + sizeof(name), name, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_NAME failed"); + + std::string str = name; + int id = 0; + bool deviceFound = false; + for (int i = 0; i < DeviceInfoSize; ++i) { + if (0 == str.compare(DeviceInfo[i].targetName_)) { + deviceFound = true; + id = i; + break; + } + } + CHECK_RESULT(deviceFound != true, "Device %s is not supported", name); + + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].simdPerCU_), + "CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD failed"); + + error_ = + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_SIMD_WIDTH_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), 
"CL_DEVICE_SIMD_WIDTH_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].simdWidth_), + "CL_DEVICE_SIMD_WIDTH_AMD failed"); + + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].simdInstructionWidth_), + "CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD failed"); + + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].memChannelBankWidth_), + "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD failed"); + + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].localMemSizePerCU_), + "CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD failed"); + + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_LOCAL_MEM_BANKS_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_LOCAL_MEM_BANKS_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].localMemBanks_), + "CL_DEVICE_LOCAL_MEM_BANKS_AMD failed"); + + error_ = + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_GFXIP_MAJOR_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_GFXIP_MAJOR_AMD failed"); + CHECK_RESULT((value != DeviceInfo[id].gfxipMajor_), + "CL_DEVICE_GFXIP_MAJOR_AMD failed"); + + error_ = + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_GFXIP_MINOR_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_GFXIP_MINOR_AMD failed"); + + error_ = 
_wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD failed"); + CHECK_RESULT((value == 0), "CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD failed"); + + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_WAVEFRONT_WIDTH_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_WAVEFRONT_WIDTH_AMD failed"); + CHECK_RESULT((value == 0), "CL_DEVICE_WAVEFRONT_WIDTH_AMD failed"); + + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, + sizeof(cl_uint), &value, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), + "CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD failed"); + CHECK_RESULT((value == 0), "CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD failed"); +} + +static void CL_CALLBACK notify_callback(cl_event event, + cl_int event_command_exec_status, + void* user_data) {} + +void OCLDeviceQueries::run(void) { + if (failed_) { + return; + } +} + +unsigned int OCLDeviceQueries::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h new file mode 100644 index 0000000000..db6896a6f7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_DEVICE_QUERIES_H_ +#define _OCL_DEVICE_QUERIES_H_ + +#include "OCLTestImp.h" + +class OCLDeviceQueries : public OCLTestImp { + public: + OCLDeviceQueries(); + virtual ~OCLDeviceQueries(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; // set by open() for an invalid device index, a non-GPU device or a missing AMD attribute-query extension; run() returns early when set +}; + +#endif // _OCL_DEVICE_QUERIES_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp new file mode 100644 index 0000000000..372919ad13 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.cpp @@ -0,0 +1,225 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLDynamic.h" + +#include +#include +#include + +#include "CL/cl.h" + +static const cl_uint TotalElements = 128; +static cl_uint hostArray[TotalElements]; + +#define KERNEL_CODE(...) 
#__VA_ARGS__ + +const static char* strKernel[] = { + KERNEL_CODE( + \n void block_fn(int tid, int mul, __global uint* res) { + res[tid] = mul * 7 - 21; + } + + __kernel void dynamic(__global uint* res) { + int multiplier = 3; + int tid = get_global_id(0); + + void (^kernelBlock)(void) = ^{ + block_fn(tid, multiplier, res); + }; + + res[tid] = -1; + queue_t def_q = get_default_queue(); + ndrange_t ndrange = ndrange_1D(1); + int enq_res; + do { + enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, + kernelBlock); + if (enq_res != 0 /*CL_SUCCESS*/) { + res[tid] = -2; + } + } while (enq_res != 0); + } + \n), + KERNEL_CODE( + \n void block_fn(int tid, int mul, __global uint* res) { + res[tid] = mul * 7 - 21; + } + + __kernel void dynamic(__global uint* res, queue_t def_q) { + int multiplier = 3; + int tid = get_global_id(0); + + void (^kernelBlock)(void) = ^{ + block_fn(tid, multiplier, res); + }; + + res[tid] = -1; + ndrange_t ndrange = ndrange_1D(1); + // if (tid == 0) { + int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, + ndrange, kernelBlock); + if (enq_res != 0 /*CL_SUCCESS*/) { + res[tid] = -2; + return; + } + //} + } + \n)}; + +OCLDynamic::OCLDynamic() { + _numSubTests = 2; + deviceQueue_ = NULL; + failed_ = false; +} + +OCLDynamic::~OCLDynamic() {} + +void OCLDynamic::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + // FIXME: Re-enable CPU test once bug 10143 is fixed. 
+ if (type_ == CL_DEVICE_TYPE_CPU) { + return; + } + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + + size_t param_size = 0; + char* strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[7] < '2') { + failed_ = true; + return; + } + delete strVersion; + + char dbuffer[1024] = {0}; + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel[test], + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "dynamic", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + memset(hostArray, 0xee, sizeof(hostArray)); + buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, sizeof(hostArray), + &hostArray, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + cl_uint queueSize = (test == 0) ? 
1 : 257 * 1024; + +#if defined(CL_VERSION_2_0) + const cl_queue_properties cprops[] = { + CL_QUEUE_PROPERTIES, + static_cast(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + CL_QUEUE_ON_DEVICE_DEFAULT | + CL_QUEUE_ON_DEVICE), + CL_QUEUE_SIZE, queueSize, 0}; + deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[deviceId], cprops, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); +#endif +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLDynamic::run(void) { + // FIXME: Re-enable CPU test once bug 10143 is fixed. + if (type_ == CL_DEVICE_TYPE_CPU) { + return; + } + + if (failed_) return; + cl_mem buffer = buffers()[0]; + + size_t gws[1] = {TotalElements}; + size_t lws[1] = {16}; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + if (testID_ == 1) { + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_command_queue), + &deviceQueue_); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + } + + size_t offset = 0; + size_t region = TotalElements * sizeof(cl_uint); + + cl_uint* host = reinterpret_cast(_wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], buffer, CL_TRUE, (CL_MAP_READ | CL_MAP_WRITE), + offset, region, 0, NULL, NULL, &error_)); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapBuffer() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + + for (unsigned int i = 0; i < TotalElements; ++i) { + if (host[i] != 0) { + printf("Bad value: a[%d] = %d\n", i, hostArray[i]); + CHECK_RESULT(true, "Incorrect result for dependency!\n"); + } + } + error_ = 
_wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], buffer, + host, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapBuffer() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); +} + +unsigned int OCLDynamic::close(void) { + // FIXME: Re-enable CPU test once bug 10143 is fixed. + if (type_ == CL_DEVICE_TYPE_CPU) { + return 0; + } + + if (NULL != deviceQueue_) { + _wrapper->clReleaseCommandQueue(deviceQueue_); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h new file mode 100644 index 0000000000..f75a40e0cb --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamic.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_DYNAMIC_H_ +#define _OCL_DYNAMIC_H_ + +#include "OCLTestImp.h" + +class OCLDynamic : public OCLTestImp { + public: + OCLDynamic(); + virtual ~OCLDynamic(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + cl_command_queue deviceQueue_; // on-device default queue created in open() and released in close() + bool failed_; // set in open() when the device version string reports a major version below 2; run() then returns early + unsigned int testID_; // sub-test index given to open(); sub-test 1 passes deviceQueue_ as a kernel argument +}; + +#endif // _OCL_DYNAMIC_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp new file mode 100644 index 0000000000..0170ee4a84 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.cpp @@ -0,0 +1,357 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
*/ + +#include "OCLDynamicBLines.h" + +#include +#include +#include +#include + +#include "CL/cl.h" + +const static cl_int nLines = 2048; +const static cl_int blockDim = 64; +#define MAX_TESSELLATION 64 + +#define KERNEL_CODE(...) #__VA_ARGS__ + +const static char* strKernel[] = +{ + KERNEL_CODE( + \n + \x23 define MAX_TESSELLATION 64 + \n + struct BezierLine + { + float2 CP[3]; + ulong vertexPos; + int nVertices; + int reserved; + }; + \n + __kernel + void computeBezierLinePositions(int lidx, __global struct BezierLine* bLines, + int nTessPoints, __global char* buf) + { + int idx = get_global_id(0); + if (idx < nTessPoints) { + float u = (float)idx / (float)(nTessPoints-1); + float omu = 1.0f - u; + + float B3u[3]; + + B3u[0] = omu * omu; + B3u[1] = 2.0f * u * omu; + B3u[2] = u * u; + + float2 position = {0, 0}; + + for (int i = 0; i < 3; i++) { + position = position + B3u[i] * bLines[lidx].CP[i]; + } + + ((__global float2*)(bLines[lidx].vertexPos))[idx] = position; + } + } + \n + __kernel + void computeBezierLines(__global struct BezierLine* bLines, int nLines, __global char* buf) + { + int lidx = get_global_id(0); + + if (lidx < nLines) { + float curvature = length(bLines[lidx].CP[1] - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) / + length(bLines[lidx].CP[2] - bLines[lidx].CP[0]); + int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION); + + if (bLines[lidx].vertexPos == 0) { + bLines[lidx].nVertices = nTessPoints; + uint value = atomic_add((__global volatile uint*)buf, + nTessPoints * sizeof(float2)); + bLines[lidx].vertexPos = (ulong)(&buf[value]); + } + + queue_t def_q = get_default_queue(); + ndrange_t ndrange = ndrange_1D(bLines[lidx].nVertices, 64); + + int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ computeBezierLinePositions(lidx, bLines, bLines[lidx].nVertices, buf); }); + } + } + \n + __kernel + void computeBezierLines2(__global struct BezierLine* bLines, int nLines, __global char* buf) + { + 
int lidx = get_global_id(0); + + if (lidx < nLines) { + float curvature = length(bLines[lidx].CP[1] - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) / + length(bLines[lidx].CP[2] - bLines[lidx].CP[0]); + int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION); + + if (bLines[lidx].vertexPos == 0) { + bLines[lidx].nVertices = nTessPoints; + uint value = atomic_add((__global volatile uint*)buf, + nTessPoints * sizeof(float2)); + bLines[lidx].vertexPos = (ulong)(&buf[value]); + } + } + } + \n + ) +}; + +OCLDynamicBLines::OCLDynamicBLines() { + _numSubTests = 1; + deviceQueue_ = NULL; + failed_ = false; + bLines_ = NULL; + hostArray_ = NULL; + kernel2_ = NULL; + kernel3_ = NULL; +} + +OCLDynamicBLines::~OCLDynamicBLines() {} + +void OCLDynamicBLines::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + if (type_ == CL_DEVICE_TYPE_CPU) { + return; + } + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + + size_t param_size = 0; + char* strVersion = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0, + 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strVersion = new char[param_size]; + error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strVersion[7] < '2') { + failed_ = true; + return; + } + delete strVersion; + + char dbuffer[1024] = {0}; + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel[test], + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + 
CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "computeBezierLines", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + kernel2_ = _wrapper->clCreateKernel(program_, "computeBezierLines2", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + kernel3_ = + _wrapper->clCreateKernel(program_, "computeBezierLinePositions", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + bLines_ = new BezierLine[nLines]; + + cl_float2 last = {0, 0}; + for (int i = 0; i < nLines; i++) { + bLines_[i].CP[0] = last; + + for (int j = 1; j < 3; j++) { + bLines_[i].CP[j].s[0] = (float)rand() / (float)RAND_MAX; + bLines_[i].CP[j].s[1] = (float)rand() / (float)RAND_MAX; + } + + last = bLines_[i].CP[2]; + bLines_[i].vertexPos = 0; + bLines_[i].nVertices = 0; + bLines_[i].reserved = 0; + } + + buffer = + _wrapper->clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, + sizeof(BezierLine) * nLines, bLines_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + + hostArray_ = new cl_float2[nLines * (MAX_TESSELLATION + 1)]; + ((unsigned int*)hostArray_)[0] = sizeof(cl_float2); + buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_USE_HOST_PTR, + sizeof(cl_float2) * nLines * MAX_TESSELLATION, hostArray_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + + cl_uint queueSize = 256 * 1024; +#if defined(CL_VERSION_2_0) + const cl_queue_properties cprops[] = { + CL_QUEUE_PROPERTIES, + static_cast(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + CL_QUEUE_ON_DEVICE_DEFAULT | + CL_QUEUE_ON_DEVICE), + CL_QUEUE_SIZE, queueSize, 0}; + deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[deviceId], cprops, &error_); 
+ CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); +#endif +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLDynamicBLines::run(void) { + CPerfCounter timer; + if (type_ == CL_DEVICE_TYPE_CPU) { + return; + } + + if (failed_) return; + + cl_mem buffer = buffers()[0]; + cl_mem alloc = buffers()[1]; + + size_t gws[1] = {nLines}; + size_t lws[1] = {blockDim}; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_int), &nLines); + error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), &alloc); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + + for (int i = 0; i < nLines; i++) { + bLines_[i].vertexPos = 0; + bLines_[i].nVertices = 0; + bLines_[i].reserved = 0; + } + ((unsigned int*)hostArray_)[0] = sizeof(cl_float2); + + timer.Reset(); + timer.Start(); + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + double sec = timer.GetElapsedTime(); + + for (int i = 0; i < nLines; i++) { + bLines_[i].vertexPos = 0; + bLines_[i].nVertices = 0; + bLines_[i].reserved = 0; + } + unsigned int allocSize = ((unsigned int*)hostArray_)[0]; + ((unsigned int*)hostArray_)[0] = sizeof(cl_float2); + + // + // Host emulation + // + timer.Reset(); + timer.Start(); + // Step 1. 
Fill the jobs + error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), &buffer); + error_ |= _wrapper->clSetKernelArg(kernel2_, 1, sizeof(cl_int), &nLines); + error_ |= _wrapper->clSetKernelArg(kernel2_, 2, sizeof(cl_mem), &alloc); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, + NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + _wrapper->clFinish(cmdQueues_[_deviceId]); + + // Step 2. Run all jobs + for (int lidx = 0; lidx < nLines; lidx++) { + // Readback the new dimension. + error_ = _wrapper->clSetKernelArg(kernel3_, 0, sizeof(cl_int), &lidx); + error_ |= _wrapper->clSetKernelArg(kernel3_, 1, sizeof(cl_mem), &buffer); + error_ |= _wrapper->clSetKernelArg(kernel3_, 2, sizeof(cl_int), + &bLines_[lidx].nVertices); + error_ |= _wrapper->clSetKernelArg(kernel3_, 3, sizeof(cl_mem), &alloc); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gwsL[1] = {static_cast<size_t>(bLines_[lidx].nVertices)}; + size_t lwsL[1] = {blockDim}; // NOTE(review): gwsL/lwsL are computed but the launch below passes gws/lws - confirm which range is intended + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel3_, + 1, NULL, gws, lws, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + double sec2 = timer.GetElapsedTime(); + + if (memcmp(&allocSize, hostArray_, sizeof(cl_uint)) != 0) { + CHECK_RESULT(true, "Validaiton failed!"); + } + + if (sec >= sec2) { + _perfInfo = (float)(sec2 - sec); + CHECK_RESULT(true, "Device enqueue is slower than emulation (sec)"); + return; + } + + _perfInfo = (float)(((sec2 - sec) / sec) * 100); + testDescString = "Device enqueue is (%%) faster"; +} + +unsigned int OCLDynamicBLines::close(void) { + // FIXME: Re-enable CPU test once bug 10143 is fixed.
+ if (type_ == CL_DEVICE_TYPE_CPU) { + return 0; + } + + delete[] bLines_; + delete[] hostArray_; + + if (NULL != deviceQueue_) { + _wrapper->clReleaseCommandQueue(deviceQueue_); + } + if (NULL != kernel2_) { + _wrapper->clReleaseKernel(kernel2_); + } + if (NULL != kernel3_) { + _wrapper->clReleaseKernel(kernel3_); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h new file mode 100644 index 0000000000..bbb9386c4b --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDynamicBLines.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/

#ifndef _OCL_DYNAMIC_BLINES_H_
#define _OCL_DYNAMIC_BLINES_H_

#include "OCLTestImp.h"

// Test case built on OCLTestImp. Its run() times a device-enqueue variant
// against a host-driven emulation and reports the relative difference.
class OCLDynamicBLines : public OCLTestImp {
 public:
  OCLDynamicBLines();
  virtual ~OCLDynamicBLines();

 public:
  // Test entry points overriding OCLTestImp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  // Frees bLines_/hostArray_ and releases deviceQueue_, kernel2_ and kernel3_
  // before delegating to OCLTestImp::close(). For CL_DEVICE_TYPE_CPU it
  // returns 0 immediately without releasing anything.
  virtual unsigned int close(void);

 private:
  // One line record; run() reads nVertices for every entry of bLines_.
  // NOTE(review): CP presumably holds the three control points of a quadratic
  // Bezier segment, and the layout presumably mirrors a device-side struct --
  // confirm against the kernel source.
  struct BezierLine {
    cl_float2 CP[3];
    long long vertexPos;
    int nVertices;  // passed to kernel3_ as argument 2 for each line
    int reserved;
  };

  cl_command_queue deviceQueue_;  // released in close() when non-NULL
  bool failed_;                   // usage not visible in this part of the file
  unsigned int testID_;           // usage not visible in this part of the file
  BezierLine* bLines_;            // array, freed with delete[] in close()
  cl_float2* hostArray_;          // array, freed with delete[] in close()
  cl_kernel kernel2_;  // launched once by run() to fill the jobs
  cl_kernel kernel3_;  // launched by run() once per line from a host loop
};

#endif  // _OCL_DYNAMIC_BLINES_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp
new file mode 100644
index 0000000000..fd09049132
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.cpp
@@ -0,0 +1,815 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLGenericAddressSpace.h" + +#include "CL/cl.h" + +#define TO_LOCAL_FAIL 0x000f0 +#define TO_GLOBAL_FAIL 0x00e00 +#define TO_PRIVATE_FAIL 0x0d000 +#define WRONG_VALUE 0xc0000 + +OCLGenericAddressSpace::OCLGenericAddressSpace() { _numSubTests = 7; } + +OCLGenericAddressSpace::~OCLGenericAddressSpace() {} + +void OCLGenericAddressSpace::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "error_ opening test"); + silentFailure = false; + _openTest = test; + size_t param_size = 0; + program_ = 0; + kernel_ = 0; + char* strVersion = 0; + arrSize = 1000; + error_ = _wrapper->clGetDeviceInfo( + devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, ¶m_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + strVersion = (char*)malloc(param_size); + error_ = + _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, + param_size, strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed"); + if (strVersion[9] < '2') { + printf("\nOpenCL C 2.0 not supported\n"); + silentFailure = true; + } + free(strVersion); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLGenericAddressSpace::run(void) { + if (silentFailure) return; + switch (_openTest) { + case 0: + test0(); + break; + case 1: + test1(); + break; + case 2: + test2(); + break; + case 3: + test3(); + break; + case 4: + test4(); + break; + case 5: + test5(); + break; + case 6: + test6(); + break; + } + return; +} + +void OCLGenericAddressSpace::test6(void) { + const char* 
kernel_str = + "\n\ + __global unsigned int gint = 1; \n\ + __kernel void test(__global ulong *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + unsigned int *ptr; \n\ + __private unsigned int pint = tid + 2; \n\ + if ((tid % 2) == 0) { \n\ + ptr = &pint; \n\ + } \n\ + else { \n\ + ptr = &gint; \n\ + } \n\ + results[0] = *ptr;\n\ + results[1] = pint;\n\ + results[2] = ptr;\n\ + results[3] = to_private(ptr);\n\ + results[4] = &pint;\n\ + } \n"; + const size_t global_work_size = 1; + const size_t arrSize = global_work_size * 5; + cl_ulong* output_arr = (cl_ulong*)malloc(arrSize * sizeof(cl_ulong)); + memset(output_arr, 0, arrSize * sizeof(cl_ulong)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_ulong), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = 
_wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_ulong) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + if (output_arr[0] != 2) { + printf( + "\n*ptr:0x%llx, pint:0x%llx, ptr:0x%llx, to_private(ptr):0x%llx, " + "&pint:0x%llx", + (unsigned long long)output_arr[0], (unsigned long long)output_arr[1], + (unsigned long long)output_arr[2], (unsigned long long)output_arr[3], + (unsigned long long)output_arr[4]); + printf("\n\n"); + error_ = 1; + } + free(output_arr); + CHECK_RESULT((error_ != CL_SUCCESS), "Generic Address Space - test2 failed"); +} + +void OCLGenericAddressSpace::test5(void) { + const char* kernel_str = + "\n\ + __global unsigned int gint = 1; \n\ + __kernel void test(__global ulong *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + results[tid] = 0; \n\ + unsigned int *ptr; \n\ + __local unsigned int lint; \n\ + lint = 2; \n\ + if ((tid % 2) == 0) { \n\ + ptr = &lint; \n\ + } \n\ + else { \n\ + ptr = &gint; \n\ + } \n\ + barrier(CLK_GLOBAL_MEM_FENCE); \n\ + if ((tid % 2) == 0) { \n\ + results[tid*5] = *ptr;\n\ + results[tid*5+1] = lint;\n\ + results[tid*5+2] = ptr;\n\ + results[tid*5+3] = to_local(ptr);\n\ + results[tid*5+4] = &lint;\n\ + } \n\ + else { \n\ + results[tid*5] = *ptr;\n\ + results[tid*5+1] = gint;\n\ + results[tid*5+2] = ptr;\n\ + results[tid*5+3] = to_global(ptr);\n\ + results[tid*5+4] = &gint;\n\ + } \n\ + } \n"; + const size_t global_work_size = 2; + const size_t arrSize = global_work_size * 5; + cl_ulong* output_arr = (cl_ulong*)malloc(arrSize * sizeof(cl_ulong)); + memset(output_arr, 0, arrSize * sizeof(cl_ulong)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_ulong), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + 
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_ulong) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + int error_cnt = 0; + for (unsigned int i = 0; i < global_work_size; ++i) { + if (((i % 2 == 0) && (output_arr[i * 5] != 2)) || + ((i % 2 == 1) && (output_arr[i * 5] != 1))) { + ++error_cnt; + } + } + if (error_cnt) { + printf("\nNumber of wrong results: %d/%d\n\n", error_cnt, + (int)global_work_size); + for (unsigned int i = 0; i < global_work_size; ++i) { + if (i % 2 == 0) { + printf( + "\n*ptr:0x%llx, lint:0x%llx, ptr:0x%llx, to_local(ptr):0x%llx, " + "&lint:0x%llx", + (unsigned long long)output_arr[i * 5], + (unsigned long long)output_arr[i * 5 + 1], + (unsigned long long)output_arr[i * 5 + 2], + (unsigned long long)output_arr[i * 5 + 3], + (unsigned long long)output_arr[i * 5 + 4]); + } else { + printf( + "\n*ptr:0x%llx, gint:0x%llx, ptr:0x%llx, to_global(ptr):0x%llx, " + "&gint:0x%llx", + 
(unsigned long long)output_arr[i * 5], + (unsigned long long)output_arr[i * 5 + 1], + (unsigned long long)output_arr[i * 5 + 2], + (unsigned long long)output_arr[i * 5 + 3], + (unsigned long long)output_arr[i * 5 + 4]); + } + } + printf("\n\n"); + } + free(output_arr); + CHECK_RESULT((error_cnt != 0), "Generic Address Space - test2 failed"); +} + +void OCLGenericAddressSpace::test4(void) { + const char* kernel_str = + "\n\ + __global unsigned int gint = 1; \n\ + __kernel void test(__global ulong *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + results[tid] = 0; \n\ + unsigned int *ptr; \n\ + __private unsigned int pint = 2; \n\ + if ((tid % 2) == 0) { \n\ + ptr = &pint; \n\ + } \n\ + else { \n\ + ptr = &gint; \n\ + } \n\ + barrier(CLK_GLOBAL_MEM_FENCE); \n\ + if ((tid % 2) == 0) { \n\ + results[tid*5] = *ptr;\n\ + results[tid*5+1] = pint;\n\ + results[tid*5+2] = ptr;\n\ + results[tid*5+3] = to_private(ptr);\n\ + results[tid*5+4] = &pint;\n\ + } \n\ + else { \n\ + results[tid*5] = *ptr;\n\ + results[tid*5+1] = gint;\n\ + results[tid*5+2] = ptr;\n\ + results[tid*5+3] = to_global(ptr);\n\ + results[tid*5+4] = &gint;\n\ + } \n\ + } \n"; + const size_t global_work_size = 2; + const size_t arrSize = global_work_size * 5; + cl_ulong* output_arr = (cl_ulong*)malloc(arrSize * sizeof(cl_ulong)); + memset(output_arr, 0, arrSize * sizeof(cl_ulong)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_ulong), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 
0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_ulong) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + int error_cnt = 0; + for (unsigned int i = 0; i < global_work_size; ++i) { + if (((i % 2 == 0) && (output_arr[i * 5] != 2)) || + ((i % 2 == 1) && (output_arr[i * 5] != 1))) { + ++error_cnt; + } + } + if (error_cnt) { + printf("\nNumber of wrong results: %d/%d\n\n", error_cnt, + (int)global_work_size); + for (unsigned int i = 0; i < global_work_size; ++i) { + if (i % 2 == 0) { + printf( + "\n*ptr:0x%llx, pint:0x%llx, ptr:0x%llx, to_private(ptr):0x%llx, " + "&pint:0x%llx", + (unsigned long long)output_arr[i * 5], + (unsigned long long)output_arr[i * 5 + 1], + (unsigned long long)output_arr[i * 5 + 2], + (unsigned long long)output_arr[i * 5 + 3], + (unsigned long long)output_arr[i * 5 + 4]); + } else { + printf( + "\n*ptr:0x%llx, gint:0x%llx, ptr:0x%llx, to_global(ptr):0x%llx, " + "&gint:0x%llx", + (unsigned long long)output_arr[i * 5], + (unsigned long long)output_arr[i * 5 + 1], + (unsigned long long)output_arr[i * 5 + 2], + (unsigned long long)output_arr[i * 5 + 3], + (unsigned long long)output_arr[i * 5 + 4]); + } + } + printf("\n\n"); + } + free(output_arr); + CHECK_RESULT((error_cnt != 0), "Generic Address Space - 
test2 failed"); +} + +void OCLGenericAddressSpace::test3(void) { + const char* kernel_str = + "\n\ + #define TO_LOCAL_FAIL 0x000f0\n\ + #define TO_GLOBAL_FAIL 0x00e00\n\ + #define TO_PRIVATE_FAIL 0x0d000\n\ + #define WRONG_VALUE 0xc0000\n\ + __global unsigned int gint = 1; \n\ + __kernel void test(__global uint *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + results[tid] = 0; \n\ + unsigned int *ptr; \n\ + __local unsigned int lint; \n\ + lint = 2; \n\ + __private unsigned int pint = 3; \n\ + switch (tid % 3) \n\ + {\n\ + case 0:\n\ + ptr = &gint; break; \n\ + case 1:\n\ + ptr = &lint; break; \n\ + case 2:\n\ + ptr = &pint; break; \n\ + }\n\ + barrier(CLK_GLOBAL_MEM_FENCE); \n\ + switch (tid % 3) \n\ + {\n\ + case 0:\n\ + if(to_global(ptr) && (*ptr == 1))\n\ + {\n\ + results[tid] = *ptr;\n\ + }\n\ + else\n\ + {\n\ + if (*ptr != 1) results[tid] = WRONG_VALUE;\n\ + if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\ + }\n\ + break; \n\ + case 1:\n\ + if(to_local(ptr) && (*ptr == 2))\n\ + {\n\ + results[tid] = *ptr;\n\ + }\n\ + else\n\ + {\n\ + if (*ptr != 2) results[tid] = WRONG_VALUE;\n\ + if(!to_local(ptr)) results[tid] |= TO_LOCAL_FAIL;\n\ + }\n\ + break; \n\ + case 2:\n\ + if(to_private(ptr) && (*ptr == 3))\n\ + {\n\ + results[tid] = *ptr;\n\ + }\n\ + else\n\ + {\n\ + if (*ptr != 3) results[tid] = WRONG_VALUE;\n\ + if(!to_private(ptr)) results[tid] |= TO_PRIVATE_FAIL;\n\ + }\n\ + break; \n\ + }\n\ + } \n"; + cl_uint* output_arr = (cl_uint*)malloc(arrSize * sizeof(cl_uint)); + memset(output_arr, 0, arrSize * sizeof(cl_uint)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, 
&devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + size_t global_work_size = arrSize; + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_uint) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + int error_cnt = 0; + int wrong_values = 0; + int to_local_error = 0; + int to_global_error = 0; + int to_private_error = 0; + for (unsigned int i = 0; i < arrSize; ++i) { + switch (i % 3) { + case 0: + error_cnt += (output_arr[i] != 1); + break; + case 1: + error_cnt += (output_arr[i] != 2); + break; + case 2: + error_cnt += (output_arr[i] != 3); + break; + } + if (output_arr[i] & WRONG_VALUE) ++wrong_values; + if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error; + if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error; + if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error; + } + if (error_cnt) { + printf("\nNumber of wrong results: %d/%d ", error_cnt, (int)arrSize); + printf( + "wrong values: %d to_local_error: %d, to_global_error: %d, " + "to_private_error: %d\n", + wrong_values, to_local_error, to_global_error, to_private_error); + } + free(output_arr); + CHECK_RESULT((error_cnt != 
0), "Generic Address Space - test3 failed"); +} + +void OCLGenericAddressSpace::test2(void) { + const char* kernel_str = + "\n\ + #define TO_LOCAL_FAIL 0x000f0\n\ + #define TO_GLOBAL_FAIL 0x00e00\n\ + #define TO_PRIVATE_FAIL 0x0d000\n\ + #define WRONG_VALUE 0xc0000\n\ + __global unsigned int gint = 1; \n\ + __kernel void test(__global uint *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + results[tid] = 0; \n\ + unsigned int *ptr; \n\ + __private unsigned int pint = 2; \n\ + if ((tid % 2) == 0) { \n\ + ptr = &pint; \n\ + } \n\ + else { \n\ + ptr = &gint; \n\ + } \n\ + barrier(CLK_GLOBAL_MEM_FENCE); \n\ + if ((tid % 2) == 0) { \n\ + if (to_private(ptr) && *ptr == 2) {\n\ + results[tid] = *ptr;\n\ + }\n\ + else {\n\ + if (*ptr != 2) results[tid] = WRONG_VALUE;\n\ + if(!to_private(ptr)) results[tid] |= TO_PRIVATE_FAIL;\n\ + }\n\ + } \n\ + else { \n\ + if (to_global(ptr) && *ptr == 1) {\n\ + results[tid] = *ptr;\n\ + }\n\ + else {\n\ + if (*ptr != 1) results[tid] = WRONG_VALUE;\n\ + if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\ + }\n\ + } \n\ + } \n"; + cl_uint* output_arr = (cl_uint*)malloc(arrSize * sizeof(cl_uint)); + memset(output_arr, 0, arrSize * sizeof(cl_uint)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + 
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + size_t global_work_size = arrSize; + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_uint) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + int error_cnt = 0; + int wrong_values = 0; + int to_local_error = 0; + int to_global_error = 0; + int to_private_error = 0; + + for (unsigned int i = 0; i < arrSize; ++i) { + if (((i % 2 == 0) && (output_arr[i] != 2)) || + ((i % 2 == 1) && (output_arr[i] != 1))) { + if (output_arr[i] & WRONG_VALUE) ++wrong_values; + if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error; + if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error; + if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error; + ++error_cnt; + } + } + free(output_arr); + if (error_cnt) { + printf("\nNumber of wrong results: %d/%d", error_cnt, (int)arrSize); + printf( + "wrong values: %d to_local_error: %d, to_global_error: %d, " + "to_private_error: %d\n", + wrong_values, to_local_error, to_global_error, to_private_error); + } + CHECK_RESULT((error_cnt != 0), "Generic Address Space - test2 failed"); +} + +void OCLGenericAddressSpace::test1(void) { + const char* kernel_str = + "\n\ + #define TO_LOCAL_FAIL 0x000f0\n\ + #define TO_GLOBAL_FAIL 0x00e00\n\ + #define TO_PRIVATE_FAIL 0x0d000\n\ + #define WRONG_VALUE 0xc0000\n\ + __global unsigned int gint1 = 1; \n\ + __global unsigned int gint2 = 2; \n\ + __kernel void test(__global uint *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + 
results[tid] = 0; \n\ + unsigned int *ptr; \n\ + if ((tid % 2) == 0) { \n\ + ptr = &gint2; \n\ + } \n\ + else { \n\ + ptr = &gint1; \n\ + } \n\ + barrier(CLK_GLOBAL_MEM_FENCE); \n\ + if ((tid % 2) == 0) { \n\ + if (to_global(ptr) && *ptr == 2) {\n\ + results[tid] = *ptr;\n\ + }\n\ + else {\n\ + if (*ptr != 2) results[tid] = WRONG_VALUE;\n\ + if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\ + }\n\ + } \n\ + else { \n\ + if (to_global(ptr) && *ptr == 1) {\n\ + results[tid] = *ptr;\n\ + }\n\ + else {\n\ + if (*ptr != 1) results[tid] = WRONG_VALUE;\n\ + if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\ + }\n\ + } \n\ + } \n"; + cl_uint* output_arr = (cl_uint*)malloc(arrSize * sizeof(cl_uint)); + memset(output_arr, 0, arrSize * sizeof(cl_uint)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + size_t global_work_size = arrSize; + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), 
"clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_uint) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + int error_cnt = 0; + int wrong_values = 0; + int to_local_error = 0; + int to_global_error = 0; + int to_private_error = 0; + + for (unsigned int i = 0; i < arrSize; ++i) { + if (((i % 2 == 0) && (output_arr[i] != 2)) || + ((i % 2 == 1) && (output_arr[i] != 1))) { + if (output_arr[i] & WRONG_VALUE) ++wrong_values; + if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error; + if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error; + if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error; + ++error_cnt; + } + } + free(output_arr); + if (error_cnt) { + printf("\nNumber of wrong results: %d/%d", error_cnt, (int)arrSize); + printf( + "wrong values: %d to_local_error: %d, to_global_error: %d, " + "to_private_error: %d\n", + wrong_values, to_local_error, to_global_error, to_private_error); + } + CHECK_RESULT((error_cnt != 0), "Generic Address Space - test1 failed"); +} + +void OCLGenericAddressSpace::test0(void) { + const char* kernel_str = + "\n\ + #define TO_LOCAL_FAIL 0x000f0\n\ + #define TO_GLOBAL_FAIL 0x00e00\n\ + #define TO_PRIVATE_FAIL 0x0d000\n\ + #define WRONG_VALUE 0xc0000\n\ + __global unsigned int gint = 1; \n\ + __kernel void test(__global uint *results) \n\ + { \n\ + uint tid = get_global_id(0); \n\ + results[tid] = 0; \n\ + unsigned int *ptr; \n\ + __local unsigned int lint; \n\ + lint = 2; \n\ + if ((tid % 2) == 0) { \n\ + ptr = &lint; \n\ + } \n\ + else { \n\ + ptr = &gint; \n\ + } \n\ + barrier(CLK_GLOBAL_MEM_FENCE); \n\ + if ((tid % 2) == 0) { \n\ + if (to_local(ptr) && *ptr == 2) {\n\ + results[tid] = *ptr;\n\ + }\n\ + else {\n\ + if (*ptr != 2) results[tid] = WRONG_VALUE;\n\ + if(!to_local(ptr)) results[tid] |= TO_LOCAL_FAIL;\n\ + }\n\ + } \n\ + else { \n\ + if 
(to_global(ptr) && *ptr == 1) {\n\ + results[tid] = *ptr;\n\ + }\n\ + else {\n\ + if (*ptr != 1) results[tid] = WRONG_VALUE;\n\ + if(!to_global(ptr)) results[tid] |= TO_GLOBAL_FAIL;\n\ + }\n\ + } \n\ + } \n"; + cl_uint* output_arr = (cl_uint*)malloc(arrSize * sizeof(cl_uint)); + memset(output_arr, 0, arrSize * sizeof(cl_uint)); + cl_mem buffer = _wrapper->clCreateBuffer( + context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_); + buffers_.push_back(buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], + "-cl-std=CL2.0", NULL, NULL); + if (error_ != CL_SUCCESS) { + char log[400]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 400, log, 0); + printf("\n\n%s\n\n", log); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed"); + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel failed"); + error_ = + _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*)&buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed"); + cl_event evt; + size_t global_work_size = arrSize; + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, &evt); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, sizeof(cl_uint) * arrSize, + output_arr, 1, &evt, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed"); + int error_cnt = 0; + int wrong_values = 0; + int to_local_error = 0; + int to_global_error = 0; + int to_private_error = 0; + + for (unsigned 
int i = 0; i < arrSize; ++i) { + if (((i % 2 == 0) && (output_arr[i] != 2)) || + ((i % 2 == 1) && (output_arr[i] != 1))) { + if (output_arr[i] & WRONG_VALUE) ++wrong_values; + if (output_arr[i] & TO_LOCAL_FAIL) ++to_local_error; + if (output_arr[i] & TO_GLOBAL_FAIL) ++to_global_error; + if (output_arr[i] & TO_PRIVATE_FAIL) ++to_private_error; + ++error_cnt; + } + } + free(output_arr); + if (error_cnt) { + printf("\nNumber of wrong results: %d/%d", error_cnt, (int)arrSize); + printf( + "wrong values: %d to_local_error: %d, to_global_error: %d, " + "to_private_error: %d\n", + wrong_values, to_local_error, to_global_error, to_private_error); + } + CHECK_RESULT((error_cnt != 0), "Generic Address Space - test0 failed"); +}
+
+// Tears the test down: releases the kernel object if one is held, then
+// delegates to OCLTestImp::close() and returns whatever that returns.
+unsigned int OCLGenericAddressSpace::close(void) {
+  if (kernel_) {
+    error_ = _wrapper->clReleaseKernel(kernel_);
+    // Presumably reports the failure without leaving the function (going by
+    // the macro name), so the base-class close() below still runs.
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
+    // Cleared so the handle is not released a second time.
+    kernel_ = 0;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h
new file mode 100644
index 0000000000..56aa104f61
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGenericAddressSpace.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GenericAddressSpace_H_
+#define _OCL_GenericAddressSpace_H_
+
+#include "OCLTestImp.h"
+
+// ocltst test case built on OCLTestImp. It overrides the usual
+// open()/run()/close() entry points and declares seven private sub-test
+// bodies, test0() .. test6().
+class OCLGenericAddressSpace : public OCLTestImp {
+ public:
+  OCLGenericAddressSpace();
+  virtual ~OCLGenericAddressSpace();
+
+ public:
+  // Test life-cycle overrides of OCLTestImp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Individual sub-test bodies.
+  void test0(void);
+  void test1(void);
+  void test2(void);
+  void test3(void);
+  void test4(void);
+  void test5(void);
+  void test6(void);
+  // NOTE(review): meaning not visible from this header -- presumably marks a
+  // sub-test that should be skipped quietly; confirm against the .cpp.
+  bool silentFailure;
+  // Kernel object owned by the test; released in close().
+  cl_kernel kernel_;
+  // Element count of the result buffer, also used as the global work size by
+  // the sub-tests.
+  size_t arrSize;
+};
+
+#endif  // _OCL_GenericAddressSpace_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp
new file mode 100644
index 0000000000..68c5968537
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.cpp
@@ -0,0 +1,116 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLGetQueueThreadID.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +#if !defined(ATI_OS_LINUX) +#include "WinBase.h" +typedef DWORD(WINAPI* GetThreadId)(__in HANDLE Thread); +#endif +bool badThread = false; + +OCLGetQueueThreadID::OCLGetQueueThreadID() { + _numSubTests = 1; + failed_ = false; +} + +OCLGetQueueThreadID::~OCLGetQueueThreadID() {} + +void OCLGetQueueThreadID::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + char name[1024] = {0}; + size_t size = 0; + + if (deviceId >= deviceCount_) { + failed_ = true; + return; + } + + cl_mem buffer; + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(cl_event event, + cl_int event_command_exec_status, + void* user_data) { +#if defined(ATI_OS_LINUX) + pthread_t id = (pthread_t)user_data; + pthread_t handle = pthread_self(); +#else + HMODULE module = GetModuleHandle("kernel32.dll"); + GetThreadId getThreadId = + reinterpret_cast(GetProcAddress(module, "GetThreadId")); + if (NULL == getThreadId) { + return; + } + DWORD id = getThreadId((HANDLE)user_data); + DWORD handle = GetCurrentThreadId(); +#endif + if (id != handle) { + badThread = true; + } +} + +void OCLGetQueueThreadID::run(void) { + if (failed_) { + return; + } + void* handle; + cl_event clEvent; + cl_event userEvent = clCreateUserEvent(context_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateUserEvent() failed"); + + cl_uint initVal[2] = {5, 10}; + error_ = _wrapper->clGetCommandQueueInfo(cmdQueues_[_deviceId], + CL_QUEUE_THREAD_HANDLE_AMD, + sizeof(void*), &handle, NULL); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers()[0], + 
false, 0, sizeof(cl_uint), + &initVal[0], 1, &userEvent, &clEvent); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = _wrapper->clSetEventCallback(clEvent, CL_SUBMITTED, notify_callback, + handle); + + clSetUserEventStatus(userEvent, CL_COMPLETE); + + clFinish(cmdQueues_[_deviceId]); + + clReleaseEvent(clEvent); + + clReleaseEvent(userEvent); + + CHECK_RESULT(badThread, "Thread ID is incorrect!"); +} + +unsigned int OCLGetQueueThreadID::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h new file mode 100644 index 0000000000..56a373218f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGetQueueThreadID.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_GET_QUEUE_THREAD_ID_H_
+#define _OCL_GET_QUEUE_THREAD_ID_H_
+
+#include "OCLTestImp.h"
+
+// ocltst test case built on OCLTestImp; overrides the standard
+// open()/run()/close() life-cycle entry points.
+class OCLGetQueueThreadID : public OCLTestImp {
+ public:
+  OCLGetQueueThreadID();
+  virtual ~OCLGetQueueThreadID();
+
+ public:
+  // Test life-cycle overrides of OCLTestImp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // NOTE(review): presumably set when open() cannot prepare the test so that
+  // run() can bail out early -- confirm against the .cpp.
+  bool failed_;
+};
+
+#endif  // _OCL_GET_QUEUE_THREAD_ID_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp
new file mode 100644
index 0000000000..efcf482e87
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLGlobalOffset.h" + +#include +#include +#include + +#include "CL/cl.h" + +const static cl_uint ThreadsForCheck = 2; +const static cl_uint GlobalOffset = 64; + +const static char* strKernel = + "__kernel void global_offset_test( \n" + " global uint* out_val) \n" + "{ \n" + " // Check the first thread \n" + " if (get_global_id(0) == get_global_offset(0)) { \n" + " out_val[0] = (uint)get_global_offset(0); \n" + " } \n" + " // Check the last thread \n" + " if (get_global_id(0) == (get_global_size(0) + get_global_offset(0) - " + "1)) { \n" + " out_val[1] = (uint)get_global_offset(0); \n" + " } \n" + "} \n"; + +OCLGlobalOffset::OCLGlobalOffset() { _numSubTests = 1; } + +OCLGlobalOffset::~OCLGlobalOffset() {} + +void OCLGlobalOffset::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + char dbuffer[1024] = {0}; + _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_VERSION, 1024, dbuffer, + NULL); + if (strstr(dbuffer, "OpenCL 1.0")) { + return; + } + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "global_offset_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + ThreadsForCheck * sizeof(cl_uint), NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), 
"clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLGlobalOffset::run(void) { + char dbuffer[1024] = {0}; + _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 1024, + dbuffer, NULL); + if (strstr(dbuffer, "OpenCL 1.0")) { + return; + } + cl_uint offsetValues[ThreadsForCheck] = {0xffffffff, 0xffffffff}; + cl_mem buffer = buffers()[0]; + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, true, + 0, ThreadsForCheck * sizeof(cl_uint), + offsetValues, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {0x0800000}; + size_t gwo[1] = {GlobalOffset}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + gwo, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0, + ThreadsForCheck * sizeof(cl_uint), + offsetValues, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + for (cl_uint i = 0; i < ThreadsForCheck; ++i) { + if (offsetValues[i] != GlobalOffset) { + printf("%d != %d", GlobalOffset, offsetValues[i]); + CHECK_RESULT(true, " - Incorrect result for global offset!\n"); + } + } +} + +unsigned int OCLGlobalOffset::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h new file mode 100644 index 0000000000..0363e514a4 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLGlobalOffset.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_GLOBAL_OFFSET_H_
+#define _OCL_GLOBAL_OFFSET_H_
+
+#include "OCLTestImp.h"
+
+// ocltst test case built on OCLTestImp; it adds no state of its own and only
+// overrides the standard open()/run()/close() life-cycle entry points.
+class OCLGlobalOffset : public OCLTestImp {
+ public:
+  OCLGlobalOffset();
+  virtual ~OCLGlobalOffset();
+
+ public:
+  // Test life-cycle overrides of OCLTestImp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_GLOBAL_OFFSET_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp
new file mode 100644
index 0000000000..afeb0a49f0
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.cpp
@@ -0,0 +1,389 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLImage2DFromBuffer.h" + +#include +#include +#include +#include + +#define GROUP_SIZE 256 +const unsigned int OCLImage2DFromBuffer::imageWidth = 1920; +const unsigned int OCLImage2DFromBuffer::imageHeight = 1080; + +const static char strKernel[] = + "__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | " + "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n" + "__kernel void image2imageCopy( " + " \n" + " __read_only image2d_t input, " + " \n" + " __write_only image2d_t output) " + " \n" + "{ " + " \n" + " int2 coord = (int2)(get_global_id(0), get_global_id(1)); " + " \n" + " uint4 temp = read_imageui(input, imageSampler, coord); " + " \n" + " write_imageui(output, coord, temp); " + " \n" + "} " + " \n"; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *clConvertImageAMD_fn)( + cl_context context, cl_mem image, const cl_image_format *image_format, + cl_int *errcode_ret); + +clConvertImageAMD_fn clConvertImageAMD; + +OCLImage2DFromBuffer::OCLImage2DFromBuffer() : OCLTestImp() { + _numSubTests = 6; + blockSizeX = GROUP_SIZE; + blockSizeY = 1; +} + +OCLImage2DFromBuffer::~OCLImage2DFromBuffer() {} + +void OCLImage2DFromBuffer::open(unsigned int test, char *units, + double &conversion, unsigned int deviceId) { + buffer = clImage2DOriginal = clImage2D = clImage2DOut = NULL; + done = false; + pitchAlignment = 0; + + _openTest = test; + // Initialize random number seed + srand((unsigned int)time(NULL)); + + OCLTestImp::open(test, units, conversion, deviceId); + if (_errorFlag) return; + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + testDescString = "GPU device is required for this test!\n"; + done = true; + return; + } + + if (_openTest >= 4) { + clConvertImageAMD = + (clConvertImageAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, 
"clConvertImageAMD"); + if (clConvertImageAMD == NULL) { + testDescString = "clConvertImageAMD not found!\n"; + done = true; + return; + } + } + + CompileKernel(); + AllocateOpenCLImage(); +} + +void OCLImage2DFromBuffer::run(void) { + if (_errorFlag || done) { + return; + } + + if ((_openTest % 2) == 0) { + testReadImage(clImage2D); + } else { + testKernel(); + } +} + +void OCLImage2DFromBuffer::AllocateOpenCLImage() { + const bool pitchTest = (_openTest == 2 || _openTest == 3); + cl_int status = 0; + + size_t size = 0; + pitchAlignment = 0; + status = _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_IMAGE_PITCH_ALIGNMENT, + sizeof(cl_uint), &pitchAlignment, &size); + + if (pitchAlignment != 0) { + pitchAlignment--; + } + + const unsigned int requiredPitch = + ((imageWidth + pitchAlignment) & ~pitchAlignment); + const unsigned int pitch = (!pitchTest) ? requiredPitch : imageWidth; + const size_t bufferSize = pitch * imageHeight; + CHECK_RESULT(bufferSize == 0, "ERROR: calculated image size is zero"); + + unsigned char *sourceData = new unsigned char[bufferSize]; + + // init data + for (unsigned int y = 0; y < imageHeight; y++) { + for (unsigned int x = 0; x < imageWidth / 4; x++) { + for (unsigned int p = 0; p < 4; p++) { + *(sourceData + y * pitch + x * 4 + p) = p; + } + } + } + buffer = _wrapper->clCreateBuffer(context_, + CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, + bufferSize, sourceData, &status); + + { + // testing clConvertImageAMD + if (_openTest == 4 || _openTest == 5) { + const cl_image_format format = {CL_R, CL_UNSIGNED_INT8}; +#if defined(CL_VERSION_2_0) + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth, + imageHeight, + 0, + 0, + pitch, + 0, + 0, + 0, + {buffer}}; +#else + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth, + imageHeight, + 0, + 0, + pitch, + 0, + 0, + 0, + buffer}; +#endif + clImage2DOriginal = _wrapper->clCreateImage( + context_, CL_MEM_READ_WRITE, &format, &desc, NULL, &status); + 
CHECK_RESULT(status != CL_SUCCESS, "clCreateImage() failed"); + + const cl_image_format formatRGBA = {CL_RGBA, CL_UNSIGNED_INT8}; + + clImage2D = + clConvertImageAMD(context_, clImage2DOriginal, &formatRGBA, &status); + CHECK_RESULT(status != CL_SUCCESS, "clConvertImageAMD() failed"); + + cl_mem fishyBuffer = 0; + status = clGetImageInfo(clImage2D, CL_IMAGE_BUFFER, sizeof(fishyBuffer), + &fishyBuffer, 0); + CHECK_RESULT(status != CL_SUCCESS, + "clGetImageInfo(CL_IMAGE_BUFFER) failed"); + CHECK_RESULT(fishyBuffer != buffer, + "clGetImageInfo() failed, buffer != fishyBuffer"); + } else { + const cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8}; +#if defined(CL_VERSION_2_0) + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth / 4, + imageHeight, + 0, + 0, + pitch, + 0, + 0, + 0, + {buffer}}; +#else + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth / 4, + imageHeight, + 0, + 0, + pitch, + 0, + 0, + 0, + buffer}; +#endif + + clImage2D = _wrapper->clCreateImage(context_, CL_MEM_READ_WRITE, &format, + &desc, NULL, &status); + } + + // testing pitch alignment correct check in the runtime + if (pitchTest) { + CHECK_RESULT(requiredPitch != pitch && + (clImage2D != NULL || + status != CL_INVALID_IMAGE_FORMAT_DESCRIPTOR), + "AllocateOpenCLImage() failed: (clImage2D!=NULL || " + "status!=CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) <=> (%p, %x)", + clImage2D, status); + if (requiredPitch != pitch) { + done = true; + return; + } + } + } + + delete[] sourceData; + + { + const cl_image_format format = {CL_RGBA, CL_UNSIGNED_INT8}; +#if defined(CL_VERSION_2_0) + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth / 4, + imageHeight, + 0, + 0, + 0, + 0, + 0, + 0, + {NULL}}; +#else + const cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, + imageWidth / 4, + imageHeight, + 0, + 0, + 0, + 0, + 0, + 0, + NULL}; +#endif + clImage2DOut = _wrapper->clCreateImage(context_, CL_MEM_READ_WRITE, &format, + &desc, NULL, &status); + } + CHECK_RESULT(clImage2D == 
NULL, "AllocateOpenCLImage() failed"); +} + +void OCLImage2DFromBuffer::testReadImage(cl_mem image) { + cl_int status = 0; + size_t bufferSize = imageWidth * imageHeight; + unsigned char *dstData = new unsigned char[bufferSize]; + + size_t origin[] = {0, 0, 0}; + size_t region[] = {imageWidth / 4, imageHeight, 1}; + + status = clEnqueueReadImage(cmdQueues_[_deviceId], image, 1, origin, region, + 0, 0, dstData, 0, 0, 0); + + ::clFinish(cmdQueues_[_deviceId]); + + for (unsigned int y = 0; y < imageHeight; y++) { + for (unsigned int x = 0; x < imageWidth / 4; x++) { + for (unsigned int p = 0; p < 4; p++) { + if (*(dstData + y * imageWidth + x * 4 + p) != p) { + CHECK_RESULT( + true, + "CheckCLImage: *(dstData+y*imageWidth+x*4+p)!=p => %i != %i", + *(dstData + y * imageWidth + x * 4 + p), p); + goto cleanup; + } + } + } + } +cleanup: + + delete[] dstData; +} + +void OCLImage2DFromBuffer::testKernel() { + CopyOpenCLImage(clImage2D); + + testReadImage(clImage2DOut); +} + +unsigned int OCLImage2DFromBuffer::close(void) { + if (clImage2DOriginal != NULL) clReleaseMemObject(clImage2DOriginal); + if (clImage2D != NULL) clReleaseMemObject(clImage2D); + if (clImage2DOut != NULL) clReleaseMemObject(clImage2DOut); + if (buffer != NULL) clReleaseMemObject(buffer); + return OCLTestImp::close(); +} + +void OCLImage2DFromBuffer::CopyOpenCLImage(cl_mem clImageSrc) { + cl_int status = 0; + + // Set appropriate arguments to the kernel2D + + // input buffer image + status = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImageSrc); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLImage() failed at " + "clSetKernelArg(kernel_,0,sizeof(cl_mem),&clImageSrc)"); + status = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clImage2DOut); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLImage() failed at " + "clSetKernelArg(kernel_,1,sizeof(cl_mem),&clImage2DOut)"); + + // Enqueue a kernel run call. 
+ size_t global_work_offset[] = {0, 0}; + size_t globalThreads[] = {imageWidth / 4, imageHeight}; + size_t localThreads[] = {blockSizeX, blockSizeY}; + + status = clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL, + globalThreads, NULL, 0, NULL, 0); + CHECK_RESULT((status != CL_SUCCESS), + "CopyOpenCLImage() failed at clEnqueueNDRangeKernel"); + + status = clFinish(cmdQueues_[_deviceId]); + CHECK_RESULT((status != CL_SUCCESS), "CopyOpenCLImage() failed at clFinish"); +} + +void OCLImage2DFromBuffer::CompileKernel() { + cl_int status = 0; + + size_t kernelSize = sizeof(strKernel); + const char *strs = (const char *)&strKernel[0]; + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs, + &kernelSize, &status); + + status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (status != CL_SUCCESS) { + if (status == CL_BUILD_PROGRAM_FAILURE) { + cl_int logStatus; + size_t buildLogSize = 0; + logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, buildLogSize, + NULL, &buildLogSize); + std::string buildLog; + buildLog.resize(buildLogSize); + + logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, buildLogSize, + &buildLog[0], NULL); + printf("%s", buildLog.c_str()); + } + return; + } + // get a kernel object handle for a kernel with the given name + kernel_ = _wrapper->clCreateKernel(program_, "image2imageCopy", &status); + + size_t kernel2DWorkGroupSize = 0; + status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId], + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), + &kernel2DWorkGroupSize, 0); + + if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize) { + if (blockSizeX > kernel2DWorkGroupSize) { + blockSizeX = kernel2DWorkGroupSize; + blockSizeY = 1; + } + } +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h new file mode 100644 
index 0000000000..0c59b216d7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImage2DFromBuffer.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCLImage2DFromBuffer_H_
+#define _OCLImage2DFromBuffer_H_
+
+#include "OCLTestImp.h"
+
+// ocltst test case built on OCLTestImp. Besides the standard
+// open()/run()/close() overrides it declares helpers for creating, copying
+// and reading back OpenCL images, plus the memory objects they operate on.
+class OCLImage2DFromBuffer : public OCLTestImp {
+ public:
+  OCLImage2DFromBuffer();
+  virtual ~OCLImage2DFromBuffer();
+
+  // Test life-cycle overrides of OCLTestImp.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceId);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ protected:
+  // Image dimensions shared by all sub-tests; defined in the .cpp.
+  static const unsigned int imageWidth;
+  static const unsigned int imageHeight;
+
+  // Helpers implemented in the .cpp.
+  void testReadImage(cl_mem image);
+  void testKernel();
+  void AllocateOpenCLImage();
+  void CopyOpenCLImage(cl_mem clImageSrc);
+  void CompileKernel();
+
+  // NOTE(review): presumably marks the sub-test as finished/skipped so that
+  // run() does nothing -- confirm against the .cpp.
+  bool done;
+  size_t blockSizeX; /**< Work-group size in x-direction */
+  size_t blockSizeY; /**< Work-group size in y-direction */
+  // OpenCL memory objects used by the test. Judging by the names: a plain
+  // buffer, an "original" image, the image under test and an output image --
+  // exact roles are defined in the .cpp.
+  cl_mem buffer;
+  cl_mem clImage2DOriginal;
+  cl_mem clImage2D;
+  cl_mem clImage2DOut;
+  // NOTE(review): presumably derived from CL_DEVICE_IMAGE_PITCH_ALIGNMENT --
+  // confirm units and exact meaning against the .cpp.
+  cl_uint pitchAlignment;
+};
+
+#endif  // _OCLImage2DFromBuffer_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp
new file mode 100644
index 0000000000..534c58ec6a
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.cpp
@@ -0,0 +1,347 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLImageCopyPartial.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/opencl.h"
+#include "Timer.h"
+
+// Quiet pesky warnings
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+#define NUM_SIZES 2
+static const unsigned int Sizes0[NUM_SIZES] = {16384, 16384};
+
+#define NUM_FORMATS 1
+static const cl_image_format formats[NUM_FORMATS] = {{CL_R, CL_UNSIGNED_INT16}};
+// NOTE(review): the label reads "R8" although the format above is 16-bit. It
+// is kept as-is because it is part of the emitted test description.
+static const char *textFormats[NUM_FORMATS] = {"R8"};
+static const unsigned int formatSize[NUM_FORMATS] = {2 * sizeof(cl_uchar)};
+
+static const unsigned int Iterations[2] = {1, OCLImageCopyPartial::NUM_ITER};
+
+#define NUM_SUBTESTS 3
+
+// Sub-tests enumerate: strip orientation (NUM_SIZES) x copy kind
+// (NUM_SUBTESTS) x format (NUM_FORMATS) x iteration setting (2).
+OCLImageCopyPartial::OCLImageCopyPartial() {
+  _numSubTests = NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS * 2;
+  // Start from null handles so close() is safe even if open() never ran.
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+}
+
+OCLImageCopyPartial::~OCLImageCopyPartial() {}
+
+// Context error callback; intentionally ignores all notifications.
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+// Fills `size` bytes at `ptr` with the 32-bit sequence 0, 1, 2, ...
+// `pitch` is unused and the incoming `value` is ignored (the sequence always
+// starts at 0), matching what checkData() expects.
+void OCLImageCopyPartial::setData(void *ptr, unsigned int pitch,
+                                  unsigned int size, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  value = 0;
+  for (unsigned int i = 0; i < size >> 2; i++) {
+    ptr2[i] = value;
+    value++;
+  }
+}
+
+// Verifies that `size` bytes at `ptr` hold the 32-bit sequence 0, 1, 2, ...
+// written by setData(). On the first mismatch it prints up to four words of
+// context (never reading past the end of the mapping) together with the
+// values that were expected at those positions, then flags the failure.
+// `pitch` is unused and the incoming `value` is ignored.
+void OCLImageCopyPartial::checkData(void *ptr, unsigned int pitch,
+                                    unsigned int size, unsigned int value) {
+  const unsigned int *ptr2 = (const unsigned int *)ptr;
+  const unsigned int count = size >> 2;
+  value = 0;
+  for (unsigned int i = 0; i < count; i++) {
+    if (ptr2[i] != value) {
+      printf("Data validation failed at %u!  Got", i);
+      for (unsigned int k = i; k < count && k < i + 4; k++) {
+        printf(" 0x%08x", ptr2[k]);
+      }
+      printf("\n");
+      printf("Expected");
+      for (unsigned int k = i; k < count && k < i + 4; k++) {
+        printf(" 0x%08x", value + (k - i));
+      }
+      printf("\n");
+      CHECK_RESULT(true, "Data validation failed!");
+      break;
+    }
+    value++;
+  }
+}
+
+// Prepares one sub-test: decodes the sub-test index, picks platform and
+// device, creates the context and queue, then creates and initialises the
+// destination (poisoned with 0xdeadbeef) and the source (0, 1, 2, ...).
+// `units` is not written; `conversion` is set to 1.
+void OCLImageCopyPartial::open(unsigned int test, char *units,
+                               double &conversion, unsigned int deviceId) {
+  cl_uint typeOfDevice = type_;
+  cl_uint numPlatforms = 0;
+  cl_platform_id platform = NULL;
+  cl_uint num_devices = 0;
+  cl_device_id *devices = NULL;
+  cl_device_id device = NULL;
+  size_t queryOut = 0;
+  _crcword = 0;
+  conversion = 1.0f;
+  _deviceId = deviceId;
+  _openTest = test;
+
+  context_ = 0;
+  cmd_queue_ = 0;
+  srcBuffer_ = 0;
+  dstBuffer_ = 0;
+
+  // Decode the sub-test index. copyKind is 1..3: bit 0 selects an image
+  // source, bit 1 an image destination, so at least one side is an image.
+  bufnum_ = (_openTest / (NUM_SIZES * NUM_SUBTESTS)) % NUM_FORMATS;
+  const unsigned int copyKind = ((_openTest / NUM_SIZES) % NUM_SUBTESTS) + 1;
+  srcImage_ = (copyKind & 1) != 0;
+  dstImage_ = (copyKind & 2) != 0;
+  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS * NUM_FORMATS)];
+
+  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
+  if (0 < numPlatforms) {
+    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+    const cl_int idsStatus =
+        _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
+    if (idsStatus == CL_SUCCESS) {
+      platform = platforms[_platformIndex];
+      char pbuf[100];
+      error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
+                                           sizeof(pbuf), pbuf, NULL);
+      num_devices = 0;
+      /* Get the number of requested devices */
+      // Runtime returns an error when no GPU devices are present instead of
+      // just returning 0 devices, so this result is deliberately unchecked.
+      error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, 0, NULL,
+                                        &num_devices);
+    }
+    // Array form: the storage came from new[]. Released before any early
+    // return so the list cannot leak.
+    delete[] platforms;
+    CHECK_RESULT(idsStatus != CL_SUCCESS, "clGetPlatformIDs failed");
+  }
+
+  /*
+   * If we could find our platform, use it.  If not, die as we need the AMD
+   * platform for these extensions.
+   */
+  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
+  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
+
+  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
+  CHECK_RESULT(devices == 0, "no devices");
+
+  /* Get the requested device */
+  error_ = _wrapper->clGetDeviceIDs(platform, typeOfDevice, num_devices,
+                                    devices, NULL);
+  if (error_ == CL_SUCCESS) {
+    device = devices[_deviceId];
+  }
+  // Only the selected handle is needed from here on.
+  free(devices);
+  devices = NULL;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
+
+  // The test images use the maximum 2D image size the device reports.
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                                     sizeof(size_t), &queryOut, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  bufSizeW_ = (cl_uint)queryOut;
+  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                                     sizeof(size_t), &queryOut, NULL);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  bufSizeH_ = (cl_uint)queryOut;
+
+  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
+                                       &error_);
+  CHECK_RESULT(context_ == 0, "clCreateContext failed");
+
+  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
+  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
+
+  cl_mem_flags flags = CL_MEM_WRITE_ONLY;
+  void *mem;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+
+  // Odd sub-tests copy the last 16 columns, even ones the last 16 rows.
+  if (_openTest % NUM_SIZES) {
+    origin[0] = bufSizeW_ - 16;
+    region[0] = 16;
+  } else {
+    origin[1] = bufSizeH_ - 16;
+    region[1] = 16;
+  }
+
+  if (dstImage_) {
+    dstBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, NULL, &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateImage(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * (unsigned int)region[1];
+  } else {
+    dstBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, region[0] * region[1] * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        region[0] * region[1] * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize =
+        (unsigned int)region[0] * (unsigned int)region[1] * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  // Poison the destination so stale data cannot pass validation.
+  unsigned int *ptr2 = (unsigned int *)mem;
+  for (unsigned int i = 0; i < memSize >> 2; i++) {
+    ptr2[i] = 0xdeadbeef;
+  }
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);
+
+  flags = CL_MEM_READ_ONLY;
+  if (srcImage_) {
+    srcBuffer_ =
+        _wrapper->clCreateImage2D(context_, flags, &formats[bufnum_], bufSizeW_,
+                                  bufSizeH_, 0, NULL, &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateImage(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * (unsigned int)region[1];
+  } else {
+    srcBuffer_ = _wrapper->clCreateBuffer(
+        context_, flags, region[0] * region[1] * formatSize[bufnum_], NULL,
+        &error_);
+    CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, srcBuffer_, CL_TRUE, CL_MAP_WRITE, 0,
+        region[0] * region[1] * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize =
+        (unsigned int)region[0] * (unsigned int)region[1] * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  setData(mem, (unsigned int)image_row_pitch, memSize, 0xdeadbeef);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL);
+}
+
+// Performs the partial copy selected in open(), waits for it, maps the
+// destination strip and validates it, then publishes the description string.
+void OCLImageCopyPartial::run(void) {
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {bufSizeW_, bufSizeH_, 1};
+
+  // Same strip selection as in open().
+  if (_openTest % NUM_SIZES) {
+    origin[0] = bufSizeW_ - 16;
+    region[0] = 16;
+  } else {
+    origin[1] = bufSizeH_ - 16;
+    region[1] = 16;
+  }
+
+  // Warm up
+  if (srcImage_ == false) {
+    error_ = _wrapper->clEnqueueCopyBufferToImage(
+        cmd_queue_, srcBuffer_, dstBuffer_, 0, origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyBufferToImage failed");
+  } else if (dstImage_ == false) {
+    error_ = _wrapper->clEnqueueCopyImageToBuffer(
+        cmd_queue_, srcBuffer_, dstBuffer_, origin, region, 0, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed");
+  } else {
+    error_ =
+        _wrapper->clEnqueueCopyImage(cmd_queue_, srcBuffer_, dstBuffer_, origin,
+                                     origin, region, 0, NULL, NULL);
+    CHECK_RESULT(error_, "clEnqueueCopyImage failed");
+  }
+  error_ = _wrapper->clFinish(cmd_queue_);
+  CHECK_RESULT(error_, "clFinish failed");
+
+  const char *strSrc = srcImage_ ? "img" : "buf";
+  const char *strDst = dstImage_ ? "img" : "buf";
+  void *mem;
+  size_t image_row_pitch;
+  size_t image_slice_pitch;
+  unsigned int memSize;
+  if (dstImage_) {
+    mem = _wrapper->clEnqueueMapImage(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, origin, region,
+        &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapImage failed");
+    memSize = (unsigned int)image_row_pitch * (unsigned int)region[1];
+  } else {
+    mem = _wrapper->clEnqueueMapBuffer(
+        cmd_queue_, dstBuffer_, CL_TRUE, CL_MAP_READ, 0,
+        region[0] * region[1] * formatSize[bufnum_], 0, NULL, NULL, &error_);
+    CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
+    memSize =
+        (unsigned int)region[0] * (unsigned int)region[1] * formatSize[bufnum_];
+    image_row_pitch = 0;
+  }
+  checkData(mem, (unsigned int)image_row_pitch, memSize, 0x600df00d);
+  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);
+  char buf[256];
+  SNPRINTF(buf, sizeof(buf), " (%4dx%4d) fmt:%s src:%s dst:%s i: %4d (GB/s) ",
+           bufSizeW_, bufSizeH_, textFormats[bufnum_], strSrc, strDst, numIter);
+  testDescString = buf;
+}
+
+// Releases everything open() created and returns the accumulated CRC word.
+// Every handle is checked first, so a partially failed open() is tolerated.
+unsigned int OCLImageCopyPartial::close(void) {
+  if (cmd_queue_) {
+    _wrapper->clFinish(cmd_queue_);
+  }
+
+  if (srcBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(srcBuffer_) failed");
+  }
+  if (dstBuffer_) {
+    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseMemObject(dstBuffer_) failed");
+  }
+  if (cmd_queue_) {
+    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
+                           "clReleaseCommandQueue failed");
+  }
+  if (context_) {
+    error_ = _wrapper->clReleaseContext(context_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
+  }
+
+  return _crcword;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h
new file mode 100644
index 0000000000..fbb89f06a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLImageCopyPartial.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+// The guard is named after this header. It used to be
+// _OCL_ImageCopyCorners_H_, which belongs to a different header and did not
+// match the #endif comment below.
+#ifndef _OCL_ImageCopyPartial_H_
+#define _OCL_ImageCopyPartial_H_
+
+#include "OCLTestImp.h"
+
+// Copies a 16-texel-wide strip at the far edge of a maximum-size 2D image
+// (buffer->image, image->buffer or image->image) and validates the result.
+class OCLImageCopyPartial : public OCLTestImp {
+ public:
+  OCLImageCopyPartial();
+  virtual ~OCLImageCopyPartial();
+
+ public:
+  // Creates context, queue, source and destination for sub-test `test`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Performs the copy and checks the destination contents.
+  virtual void run(void);
+  // Releases all OpenCL objects; returns the CRC word.
+  virtual unsigned int close(void);
+
+  // Iteration count used by the second half of the sub-tests.
+  static const unsigned int NUM_ITER = 1;
+
+  cl_context context_;
+  cl_command_queue cmd_queue_;
+  cl_mem srcBuffer_;  // copy source: image or buffer, see srcImage_
+  cl_mem dstBuffer_;  // copy destination: image or buffer, see dstImage_
+  cl_int error_;
+
+  unsigned int bufSizeW_;  // image width (device maximum)
+  unsigned int bufSizeH_;  // image height (device maximum)
+  unsigned int bufnum_;    // index into the format tables
+  bool srcImage_;
+  bool dstImage_;
+  unsigned int numIter;
+  // Fill / verify a mapped region with the 32-bit sequence 0, 1, 2, ...
+  // `pitch` and `value` are accepted but not used by the implementation.
+  void setData(void* ptr, unsigned int pitch, unsigned int size,
+               unsigned int value);
+  void checkData(void* ptr, unsigned int pitch, unsigned int size,
+                 unsigned int value);
+};
+
+#endif  // _OCL_ImageCopyPartial_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp
new file mode 100644
index 0000000000..5ae9932a1e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.cpp
@@ -0,0 +1,252 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLKernelBinary.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <vector>
+
+#include "CL/cl.h"
+
+// OpenCL 1.2 flavour: sums a __constant struct table; with n == 2 the result
+// is (1 + 0) + (2 + 1) = 4.
+const static char* strKernel12 =
+    "typedef struct ST { \n"
+    " int i0; \n"
+    " int i1; \n"
+    "} ST_t; \n"
+    " \n"
+    "__constant ST_t STCArray[2] = { \n"
+    " { 1, 0 }, \n"
+    " { 2, 1 } \n"
+    "}; \n"
+    " \n"
+    "__kernel void foo (__global int *p, int n) \n"
+    "{ \n"
+    " int s = 0; \n"
+    " int i; \n"
+    " for (i=0; i < n; ++i) { \n"
+    " s += STCArray[i].i0 + STCArray[i].i1; \n"
+    " } \n"
+    " *p = s; \n"
+    "} \n";
+
+// OpenCL 2.0 flavour: additionally uses a program-scope __global variable.
+const static char* strKernel20 =
+    "typedef struct ST { \n"
+    " int i0; \n"
+    " int i1; \n"
+    "} ST_t; \n"
+    " \n"
+    "__constant ST_t STCArray[2] = { \n"
+    " { -1, 0 }, \n"
+    " { 3, -1 } \n"
+    "}; \n"
+    " \n"
+    "__global int var = 1; \n"
+    " \n"
+    "__kernel void foo (__global int *p, int n) \n"
+    "{ \n"
+    " int s = 0; \n"
+    " int i; \n"
+    " for (i=0; i < n; ++i) { \n"
+    " s += STCArray[i].i0 + STCArray[i].i1 + var++; \n"
+    " } \n"
+    " p[get_global_id(0)] = s; \n"
+    "} \n";
+
+OCLKernelBinary::OCLKernelBinary() { _numSubTests = 2; }
+
+OCLKernelBinary::~OCLKernelBinary() {}
+
+// Builds the kernel from source, fetches the binary for the selected device,
+// recreates the program from that binary, builds it twice with -O0 and leaves
+// a ready kernel plus one output buffer for run().
+// Sub-test 1 is skipped (program_ == NULL) on devices older than OpenCL 2.0.
+void OCLKernelBinary::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+
+  char strVersion[128];
+  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_VERSION,
+                                     sizeof(strVersion), strVersion, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  // The version string starts with "OpenCL <major>.<minor>", so index 7 is
+  // the major version digit.
+  if (test == 1 && strVersion[7] < '2') {
+    program_ = NULL;
+    return;
+  }
+
+  const char *options, *options0;
+  const char* strKernel;
+  switch (test) {
+    case 0:
+      options = "";
+      options0 = "-O0";
+      strKernel = strKernel12;
+      break;
+    case 1:
+      options = "-cl-std=CL2.0";
+      options0 = "-cl-std=CL2.0 -O0";
+      strKernel = strKernel20;
+      break;
+    default:
+      assert(false);
+      return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  CHECK_RESULT((deviceCount_ == 0), "no devices available");
+
+  // Vectors own every scratch array, so each early return below releases
+  // them automatically.
+  std::vector<size_t> sizes(deviceCount_);
+  std::vector<size_t> sizes1(deviceCount_);
+  std::vector<size_t> sizes2(deviceCount_);
+
+  // get an array of device Id's that relate to values order returned by
+  // 'clGetProgramInfo'
+  unsigned int programInfoDeviceIdIndex = 0;
+  std::vector<cl_device_id> programInfoDevices(deviceCount_);
+  error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_DEVICES,
+                                      sizeof(cl_device_id) * deviceCount_,
+                                      &programInfoDevices[0], NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo()");
+  // map between the class devices_ array and the programInfoDeviceId array
+  for (unsigned int i = 0; i < deviceCount_; i++) {
+    if (devices_[deviceId] == programInfoDevices[i]) {
+      programInfoDeviceIdIndex = i;
+    }
+  }
+
+  error_ =
+      _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                 sizeof(size_t) * deviceCount_, &sizes[0], NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo()");
+
+  // One byte array per device; the pointer table is what the query fills.
+  std::vector<std::vector<unsigned char> > binaryStorage(deviceCount_);
+  std::vector<unsigned char*> binaries(deviceCount_);
+  for (unsigned int i = 0; i < deviceCount_; i++) {
+    if (sizes[i] > 0) {
+      binaryStorage[i].resize(sizes[i]);
+      binaries[i] = &binaryStorage[i][0];
+    } else {
+      binaries[i] = NULL;
+    }
+  }
+
+  error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
+                                      sizeof(unsigned char*) * deviceCount_,
+                                      &binaries[0], NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo()");
+
+  error_ = _wrapper->clReleaseProgram(program_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clReleaseProgram()");
+
+  const unsigned char* cBinary = binaries[programInfoDeviceIdIndex];
+  cl_int status;
+  program_ = _wrapper->clCreateProgramWithBinary(
+      context_, 1, &devices_[deviceId], &(sizes[programInfoDeviceIdIndex]),
+      &cBinary, &status, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithBinary()");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options0,
+                                    NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clBuildProgram()");
+
+  error_ =
+      _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                 sizeof(size_t) * deviceCount_, &sizes1[0], NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clGetProgramInfo()");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "foo", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clCreateKernel() failed");
+
+  // Capture the status; otherwise the check below tests a stale error_.
+  error_ = _wrapper->clReleaseKernel(kernel_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "1st clReleaseKernel() failed");
+
+  // Rebuild the binary-created program a second time.
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options0,
+                                    NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "2nd clBuildProgram()");
+
+  // NOTE(review): sizes1 and sizes2 are queried but never compared.
+  error_ =
+      _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                                 sizeof(size_t) * deviceCount_, &sizes2[0], NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "2nd clGetProgramInfo()");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "foo", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "2nd clCreateKernel() failed");
+
+  cl_mem buffer;
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                    2 * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+}
+
+// Runs kernel foo over two work-items with n == 2 and expects the first
+// output element to be 4. Does nothing when open() skipped the sub-test.
+void OCLKernelBinary::run(void) {
+  if (program_ == NULL) {
+    return;
+  }
+
+  cl_mem buffer = buffers()[0];
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  cl_int num = 2;
+  error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_int), &num);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t gws[1] = {2};
+  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                            NULL, gws, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  cl_uint outputV[2] = {0};
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0,
+                                         2 * sizeof(cl_uint), outputV, 0, NULL,
+                                         NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed");
+  CHECK_RESULT((outputV[0] != 4), "Incorrect result of kernel execution!");
+}
+
+unsigned int OCLKernelBinary::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h
new file mode 100644
index 0000000000..6453393d76
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLKernelBinary.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_KERNEL_BINARY_H_
+#define _OCL_KERNEL_BINARY_H_
+
+#include "OCLTestImp.h"
+
+// Test case that round-trips a program through its device binary: it builds
+// kernel "foo" from source, recreates the program from the retrieved binary
+// and executes it. Two sub-tests: default options and -cl-std=CL2.0.
+class OCLKernelBinary : public OCLTestImp {
+ public:
+  OCLKernelBinary();
+  virtual ~OCLKernelBinary();
+
+ public:
+  // Builds the program and kernel for sub-test `test` on device `deviceID`.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Executes the kernel and checks the value it wrote.
+  virtual void run(void);
+  // Delegates to OCLTestImp::close().
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_KERNEL_BINARY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp
new file mode 100644
index 0000000000..b33c624efd
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.cpp
@@ -0,0 +1,371 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "OCLLDS32K.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "CL/cl.h"
+// #include <stdint.h>
+#include <time.h>
+
+typedef unsigned int uint32_t;
+
+#define LDS_SIZE 32768
+#define LOCAL_WORK_SIZE 64
+
+// We'll do a 64MB transaction
+#define A_SIZE (8 * 1024 * 1024)
+#define B_SIZE A_SIZE
+#define C_SIZE A_SIZE
+#define D_SIZE A_SIZE
+
+#define GLOBAL_WORK_SIZE (A_SIZE / LDS_SIZE * LOCAL_WORK_SIZE)
+
+#define TEST_NAME "lds 32K"
+
+// Number of times exec_kernel() enqueues the main kernel.
+#define KERNEL_PASSES 3
+
+// 32K has 8192 elements
+// 64 threads each handle 8192/64=128 values
+static const char program_source[] = KERNEL(
+    __kernel void the_kernel(__global const uint *a, __global const uint *b,
+                             __global const uint *c, __global uint *d,
+                             __global uint *e) {
+      __local uint lds[8192];
+      uint gid = get_global_id(0);
+      __global const uint *ta = a + 128 * gid;
+      __global const uint *tb = b + 128 * gid;
+      __global const uint *tc = c + 128 * gid;
+      __global uint *td = d + 128 * gid;
+      uint i;
+
+      for (i = 0; i < 128; ++i) lds[ta[i]] = tc[i];
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      for (i = 0; i < 128; ++i) td[i] = lds[tb[i]];
+    } __kernel void the_kernel2(__global uint *d) {
+      __local uint lds[8192];
+      uint i;
+      uint gid = get_global_id(0);
+
+      for (i = 0; i < 128; ++i) lds[i] = d[gid];
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      for (i = 0; i < 128; ++i) d[gid] = lds[i];
+    });
+
+// Generates host input. For every 32 KB group, `a` and `b` each receive a
+// freshly shuffled permutation of 0..8191 (scatter and gather indices), `c`
+// random payload and `d` the poison value 0xfeedbeef. `e` is not touched.
+static void fill(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+                 uint32_t *e) {
+  uint32_t i, j, k, t;
+  static uint32_t p[LDS_SIZE / 4];
+  static int is_set = 0;
+
+  if (!is_set) {
+    for (i = 0; i < LDS_SIZE / 4; ++i) p[i] = i;
+    is_set = 1;
+  }
+
+  for (j = 0; j < A_SIZE / LDS_SIZE; ++j) {
+    for (i = 0; i < LDS_SIZE / 4; ++i) {
+      k = rand() % (LDS_SIZE / 4);
+      t = p[i];
+      p[i] = p[k];
+      p[k] = t;
+
+      c[i] = rand();
+    }
+    memcpy(a, p, LDS_SIZE);
+
+    for (i = 0; i < LDS_SIZE / 4; ++i) {
+      k = rand() % (LDS_SIZE / 4);
+      t = p[i];
+      p[i] = p[k];
+      p[k] = t;
+
+      d[i] = 0xfeedbeefU;
+    }
+    memcpy(b, p, LDS_SIZE);
+
+    a += LDS_SIZE / 4;
+    b += LDS_SIZE / 4;
+    c += LDS_SIZE / 4;
+    d += LDS_SIZE / 4;
+  }
+}
+
+// Replays the scatter/gather on the host and compares with the device output
+// in `d`. Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE
+// (after printing the first mismatch) otherwise. `e` is not inspected.
+static int check(const uint32_t *a, const uint32_t *b, const uint32_t *c,
+                 const uint32_t *d, const uint32_t *e) {
+  uint32_t i, j, t;
+  uint32_t lds[LDS_SIZE / 4];
+
+  for (j = 0; j < A_SIZE / LDS_SIZE; ++j) {
+    for (i = 0; i < LDS_SIZE / 4; ++i) lds[i] = 0xdeadbeef;
+
+    for (i = 0; i < LDS_SIZE / 4; ++i) lds[a[i]] = c[i];
+
+    for (i = 0; i < LDS_SIZE / 4; ++i) {
+      t = lds[b[i]];
+      if (d[i] != t) {
+        printf("mismatch group %u thread %u element %u: %u instead of %u\n", j,
+               i / 128, i % 128, d[i], t);
+        return EXIT_FAILURE;
+      }
+    }
+
+    a += LDS_SIZE / 4;
+    b += LDS_SIZE / 4;
+    c += LDS_SIZE / 4;
+    d += LDS_SIZE / 4;
+  }
+  return EXIT_SUCCESS;
+}
+
+#ifndef E_SIZE
+#define E_SIZE 32
+#endif
+
+// Builds the program, creates both kernels and the five device buffers and
+// binds the kernel arguments. On any failure error_ is left != CL_SUCCESS so
+// that the caller's check fires.
+void OCLLDS32K::setup_run(const char *cmplr_opt) {
+  cl_ulong lsize = 0;
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_LOCAL_MEM_SIZE,
+                                sizeof(lsize), &lsize, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo failed");
+  if (lsize < LDS_SIZE) {
+    fprintf(stderr, "Passed! Test does not support 32kb of lds space!");
+    return;
+  }
+
+  // create the program
+  const char *source = program_source;
+  program_ =
+      _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+
+  // build the program
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    cmplr_opt, NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    char build_log[16384];
+    size_t log_sz;
+    fprintf(stderr, "build program failed, err=%d\n", error_);
+    // Use a separate status so error_ keeps the build failure; otherwise a
+    // successful log query would hide it from the caller.
+    const cl_int logStatus = _wrapper->clGetProgramBuildInfo(
+        program_, devices_[_deviceId], CL_PROGRAM_BUILD_LOG, sizeof(build_log),
+        build_log, &log_sz);
+    if (logStatus != CL_SUCCESS)
+      fprintf(stderr, "failed to get build log, err=%d\n", logStatus);
+    else
+      fprintf(stderr, "----- Build Log -----\n%s\n----- ----- --- -----\n",
+              build_log);
+    return;
+  }
+
+  // create the kernel
+  kernel_ = _wrapper->clCreateKernel(program_, "the_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a kernel failed");
+
+  // create the kernel
+  kernel2_ = _wrapper->clCreateKernel(program_, "the_kernel2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a kernel failed");
+
+  // allocate the buffer memory objects
+  a_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, A_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer a failed");
+  buffers_.push_back(a_buf_);
+
+  b_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, B_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer b failed");
+  buffers_.push_back(b_buf_);
+
+  c_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, C_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer c failed");
+  buffers_.push_back(c_buf_);
+
+  d_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, D_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer d failed");
+  buffers_.push_back(d_buf_);
+
+  e_buf_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, E_SIZE, NULL,
+                                    &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "create a buffer e failed");
+  buffers_.push_back(e_buf_);
+
+  // set the args values
+  error_ =
+      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&a_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&b_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem), (void *)&c_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&d_buf_);
+  error_ |=
+      _wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_mem), (void *)&e_buf_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "setkernelArg failed!");
+
+  error_ =
+      _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void *)&d_buf_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "setkernelArg failed!");
+}
+
+// Releases the second kernel. The handle is cleared afterwards so a repeated
+// call cannot release it twice.
+void OCLLDS32K::cleanup_run() {
+  if (kernel2_) {
+    _wrapper->clReleaseKernel(kernel2_);
+    kernel2_ = NULL;
+  }
+}
+
+// Uploads a/b/c, enqueues the main kernel KERNEL_PASSES times and reads d and
+// e back. Failures are reported through error_.
+void OCLLDS32K::exec_kernel(void *a_mem, void *b_mem, void *c_mem, void *d_mem,
+                            void *e_mem) {
+  size_t global_work_size[1];
+  size_t local_work_size[1];
+
+  // Send data to device
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], a_buf_, CL_TRUE, 0, A_SIZE, a_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueWritebuffer failed");
+
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], b_buf_, CL_TRUE, 0, B_SIZE, b_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueWritebuffer failed");
+
+  error_ = _wrapper->clEnqueueWriteBuffer(
+      cmdQueues_[_deviceId], c_buf_, CL_TRUE, 0, C_SIZE, c_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueWritebuffer failed");
+
+  // set work-item dimensions
+  global_work_size[0] = GLOBAL_WORK_SIZE;
+  local_work_size[0] = LOCAL_WORK_SIZE;
+
+  // execute kernel
+  for (int pass = 0; pass < KERNEL_PASSES; ++pass) {
+    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
+                                              NULL, global_work_size,
+                                              local_work_size, 0, NULL, NULL);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel failed");
+  }
+
+  // read results
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], d_buf_, CL_TRUE,
+                                         0, D_SIZE, d_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], e_buf_, CL_TRUE,
+                                         0, E_SIZE, e_mem, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+}
+
+const char *OCLLDS32K::kernel_src = "";
+
+static void CL_CALLBACK notify_callback(const char *errinfo,
+                                        const void *private_info, size_t cb,
+                                        void *user_data) {}
+
+OCLLDS32K::OCLLDS32K() {
+  _numSubTests = 1;
+  // cleanup_run() tests kernel2_, so it must never hold an indeterminate
+  // value.
+  kernel2_ = NULL;
+  a_buf_ = NULL;
+  b_buf_ = NULL;
+  c_buf_ = NULL;
+  d_buf_ = NULL;
+  e_buf_ = NULL;
+}
+
+OCLLDS32K::~OCLLDS32K() {}
+
+void OCLLDS32K::open(unsigned int test, char *units, double &conversion,
+                     unsigned int deviceId) {
+  _deviceId = deviceId;
+  testID_ = test;
+  OCLTestImp::open(test, units, conversion, _deviceId);
+}
+
+// Runs five fill -> execute -> verify rounds. Devices with less than 32 KB of
+// local memory are skipped. Host buffers are released on every path.
+void OCLLDS32K::run(void) {
+  const char *cmplr_opt = NULL;
+  const int nj = 5;
+
+  // Skip before any setup: setup_run() only prints a note for such devices
+  // and would leave kernels and buffers uncreated.
+  cl_ulong lsize = 0;
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_LOCAL_MEM_SIZE,
+                                sizeof(lsize), &lsize, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo failed");
+  if (lsize < LDS_SIZE) {
+    fprintf(stderr, "Passed! Test does not support 32kb of lds space!");
+    return;
+  }
+
+  setup_run(cmplr_opt);
+  CHECK_RESULT((error_ != CL_SUCCESS), "setup_run failed!");
+
+  // calloc gives zeroed storage, replacing malloc + memset.
+  void *a = calloc(1, A_SIZE);
+  void *b = calloc(1, B_SIZE);
+  void *c = calloc(1, C_SIZE);
+  void *d = calloc(1, D_SIZE);
+  void *e = calloc(1, E_SIZE);
+  const bool allocFailed =
+      (a == NULL) || (b == NULL) || (c == NULL) || (d == NULL) || (e == NULL);
+
+  bool execFailed = false;
+  bool mismatch = false;
+  // printf("Testing " TEST_NAME " on %s\n", argv[1]);
+  for (int j = 0; !allocFailed && j < nj; ++j) {
+    fill((uint32_t *)a, (uint32_t *)b, (uint32_t *)c, (uint32_t *)d,
+         (uint32_t *)e);
+    // printf("%s Test %d: ", sDevice, j);
+    exec_kernel(a, b, c, d, e);
+    if (error_ != CL_SUCCESS) {
+      execFailed = true;
+      break;
+    }
+    // check() reports failure as EXIT_FAILURE, which is positive, so compare
+    // against EXIT_SUCCESS rather than testing for a negative value.
+    if (check((uint32_t *)a, (uint32_t *)b, (uint32_t *)c, (uint32_t *)d,
+              (uint32_t *)e) != EXIT_SUCCESS) {
+      mismatch = true;
+      break;
+    }
+  }
+
+  // Release host memory before reporting; CHECK_RESULT leaves the function.
+  free(a);
+  free(b);
+  free(c);
+  free(d);
+  free(e);
+
+  CHECK_RESULT(allocFailed, "malloc failed");
+  CHECK_RESULT(execFailed, "exec_kernel failed!");
+  CHECK_RESULT(mismatch, " Failed!\n");
+}
+
+unsigned int OCLLDS32K::close(void) {
+  cleanup_run();
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h
b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h new file mode 100644 index 0000000000..e398e9e615 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLDS32K.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_LDS32K_H_ +#define _OCL_LDS32K_H_ +#include "OCLTestImp.h" + +class OCLLDS32K : public OCLTestImp { + public: + OCLLDS32K(); + virtual ~OCLLDS32K(); + + public: + virtual void open(unsigned int test, char *units, double &conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + void setup_run(const char *cmplr_opt); + void cleanup_run(); + void exec_kernel(void *a_mem, void *b_mem, void *c_mem, void *d_mem, + void *e_mem); + static const char *kernel_src; + cl_kernel kernel2_; + + private: + unsigned int testID_; + cl_mem a_buf_; + cl_mem b_buf_; + cl_mem c_buf_; + cl_mem d_buf_; + cl_mem e_buf_; +}; + +#endif // _OCL_LDS32K_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp new file mode 100644 index 0000000000..a9fd35287c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.cpp @@ -0,0 +1,187 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLLinearFilter.h" + +#include +#include +#include + +#include "CL/cl.h" + +const static size_t ImageSize = 4; + +const static char* strKernel = + "const sampler_t g_Sampler = CLK_FILTER_LINEAR | \n" + " CLK_ADDRESS_CLAMP_TO_EDGE | \n" + " CLK_NORMALIZED_COORDS_FALSE; \n" + " \n" + "__kernel void linear3D(__read_only image3d_t img3D, __global float4* " + "f4Tata) \n" + "{ \n" + " float4 f4Index = { 2.25f, 1.75f, 0.5f, 0.0f }; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img3D, g_Sampler, f4Index); \n" + "} \n" + " \n" + "__kernel void linear2D(__read_only image2d_t img2D, __global float4* " + "f4Tata) \n" + "{ \n" + " float2 f2Index = { 2.25f, 1.75f }; \n" + " // copy interpolated data in result buffer \n" + " f4Tata[0] = read_imagef(img2D, g_Sampler, f2Index); \n" + "} \n" + " \n"; + +OCLLinearFilter::OCLLinearFilter() { _numSubTests = 2; } + +OCLLinearFilter::~OCLLinearFilter() {} + +void OCLLinearFilter::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + cl_bool imageSupport; + size_t size; + for (size_t i = 0; i < deviceCount_; ++i) { + _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT, + sizeof(imageSupport), &imageSupport, &size); + if (!imageSupport) { + return; + } + } + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char 
programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + const char* kernels[2] = {"linear3D", "linear2D"}; + kernel_ = _wrapper->clCreateKernel(program_, kernels[test], &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem memory; + size_t offset[3] = {0, 0, 0}; + cl_image_format imageFormat = {CL_RGBA, CL_FLOAT}; + + if (test == 0) { + float data[ImageSize][ImageSize][ImageSize][4]; + float index = 0.f; + size_t region[3] = {ImageSize, ImageSize, ImageSize}; + for (size_t z = 0; z < ImageSize; ++z) { + for (size_t y = 0; y < ImageSize; ++y) { + for (size_t x = 0; x < ImageSize; ++x) { + data[z][y][x][0] = (float)x; + data[z][y][x][1] = (float)y; + data[z][y][x][2] = (float)z; + data[z][y][x][3] = 1.0f; + } + } + } + memory = _wrapper->clCreateImage3D(context_, CL_MEM_READ_ONLY, &imageFormat, + ImageSize, ImageSize, ImageSize, 0, 0, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed"); + + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, true, + offset, region, 0, 0, data, 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed"); + } else { + float data[4][ImageSize][ImageSize]; + size_t region[3] = {ImageSize, ImageSize, 1}; + for (size_t y = 0; y < ImageSize; ++y) { + for (size_t x = 0; x < ImageSize; ++x) { + data[y][x][0] = (float)x; + data[y][x][1] = (float)y; + data[y][x][2] = data[y][x][3] = 1.0f; + } + } + + memory = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY, &imageFormat, + ImageSize, ImageSize, 0, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed"); + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], memory, true, + offset, region, 0, 0, data, 0, NULL, + NULL); + CHECK_RESULT((error_ != 
CL_SUCCESS), "clEnqueueWriteImage() failed"); + } + buffers_.push_back(memory); + + memory = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + 4 * sizeof(cl_float), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(memory); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLLinearFilter::run(void) { + cl_bool imageSupport; + size_t size; + for (size_t i = 0; i < deviceCount_; ++i) { + _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT, + sizeof(imageSupport), &imageSupport, &size); + if (!imageSupport) { + return; + } + } + cl_float values[4] = {0.f, 0.f, 0.f, 0.f}; + cl_float ref[2] = {1.75f, 1.25f}; + cl_mem image = buffers()[0]; + cl_mem buffer = buffers()[1]; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &image); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {0x1}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0, + 4 * sizeof(cl_float), values, 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + for (cl_uint i = 0; i < 2; ++i) { + if (values[i] != ref[i]) { + printf("%.2f != %.2f [ref]", values[i], ref[i]); + CHECK_RESULT(true, " - Incorrect result for linear filtering!\n"); + } + } +} + +unsigned int OCLLinearFilter::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h new file mode 100644 index 
0000000000..e0b007c5f6 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLinearFilter.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_LINEAR_FILTER_H_ +#define _OCL_LINEAR_FILTER_H_ + +#include "OCLTestImp.h" + +class OCLLinearFilter : public OCLTestImp { + public: + OCLLinearFilter(); + virtual ~OCLLinearFilter(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); +}; + +#endif // _OCL_LINEAR_FILTER_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp new file mode 100644 index 0000000000..f5afad8f42 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.cpp @@ -0,0 +1,264 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLLiquidFlash.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "CL/cl.h" + +const static size_t ChunkSize = 256 * 1024; +const static int NumSizes = 5; +const static int NumChunksArray[NumSizes] = {1, 4, 16, 32, 56}; +const static size_t MaxSubTests = 4 * NumSizes; +const static char* BinFileName = "LiquidFlash.bin"; +const static int NumIterArray[NumSizes] = {20, 15, 10, 10, 10}; +const static int NumStagesArray[NumSizes] = {2, 2, 4, 4, 4}; + +OCLLiquidFlash::OCLLiquidFlash() { +#ifdef CL_VERSION_2_0 + _numSubTests = MaxSubTests; + failed_ = false; + maxSize_ = 0; + direct_ = false; + amdFile_ = NULL; +#else + _numSubTests = 0; + failed_ = false; + maxSize_ = 0; + direct_ = false; +#endif +} + +OCLLiquidFlash::~OCLLiquidFlash() {} + +void OCLLiquidFlash::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { +#ifdef CL_VERSION_2_0 + failed_ = false; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + char name[1024] = {0}; + size_t size = 0; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024, + name, &size); + + if (!strstr(name, "cl_amd_liquid_flash")) { + printf("Liquid flash extension is required for this test!\n"); + failed_ = true; + return; + } + + NumChunks = NumChunksArray[testID_ / 4]; + NumIter = NumIterArray[testID_ / 4]; + NumStages = NumStagesArray[testID_ / 4]; + BufferSize = NumChunks * ChunkSize * sizeof(cl_uint); + direct_ = ((testID_ % 4) < 3) ? 
true : false; + createFile = + (clCreateSsgFileObjectAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clCreateSsgFileObjectAMD"); + retainFile = + (clRetainSsgFileObjectAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clRetainSsgFileObjectAMD"); + releaseFile = + (clReleaseSsgFileObjectAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clReleaseSsgFileObjectAMD"); + writeBufferFromFile = + (clEnqueueReadSsgFileAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clEnqueueReadSsgFileAMD"); + if (createFile == NULL || retainFile == NULL || releaseFile == NULL || + writeBufferFromFile == NULL) { + testDescString = "Failed to initialize LiquidFlash extension!\n"; + failed_ = true; + return; + } + + size_t chunkSize = ChunkSize; + std::ofstream fs; + fs.open(BinFileName, std::fstream::binary); + + if (fs.is_open()) { + // allocate memory for file content + cl_uint* buffer = new cl_uint[chunkSize]; + for (cl_uint i = 0; i < chunkSize; ++i) { + buffer[i] = i; + } + for (int i = 0; i < NumChunks; ++i) { + fs.write(reinterpret_cast(buffer), chunkSize * sizeof(cl_uint)); + } + delete[] buffer; + } + fs.close(); + + std::string str(BinFileName); + std::wstring wc(str.length(), L' '); + // Copy string to wstring. + std::copy(str.begin(), str.end(), wc.begin()); + + amdFile_ = createFile(context_, CL_FILE_READ_ONLY_AMD, wc.c_str(), &error_); + if (error_ != CL_SUCCESS) { + printf( + "Create file failed. Liquid flash support is required for this " + "test!\n"); + failed_ = true; + return; + } + + cl_mem buf = NULL; + if (direct_) { + cl_uint subTest = testID_ % 4; + cl_uint memFlags = (subTest == 0) + ? CL_MEM_USE_PERSISTENT_MEM_AMD + : ((subTest == 1) ? 
CL_MEM_ALLOC_HOST_PTR : 0); + buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY | memFlags, + BufferSize, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueWriteBufferFromFileAMD() failed"); + } else { + for (int i = 0; i < NumStages; ++i) { + buf = _wrapper->clCreateBuffer(context_, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + BufferSize / NumStages, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueWriteBufferFromFileAMD() failed"); + buffers_.push_back(buf); + } + + buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, BufferSize, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueWriteBufferFromFileAMD() failed"); + } + buffers_.push_back(buf); +#endif +} + +void OCLLiquidFlash::run(void) { +#ifdef CL_VERSION_2_0 + if (failed_) { + return; + } + size_t finalBuf = (direct_) ? 0 : NumStages; + + cl_uint* buffer = new cl_uint[NumChunks * ChunkSize]; + size_t iterSize = BufferSize / NumStages; + memset(buffer, 0, BufferSize); + if (direct_) { + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_TRUE, 0, BufferSize, buffer, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + } else { + for (int i = 0; i < NumStages; ++i) { + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], + buffers_[i], CL_TRUE, 0, iterSize, + buffer, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + } + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], + buffers_[finalBuf], CL_TRUE, 0, + BufferSize, buffer, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + } + + CPerfCounter timer; + + double sec = 0.; + + for (int i = 0; i < NumIter; ++i) { + timer.Reset(); + timer.Start(); + if (direct_) { + error_ = writeBufferFromFile( + cmdQueues_[_deviceId], buffers_[0], CL_FALSE, 0 /*buffer_offset*/, + BufferSize, amdFile_ /*file*/, 0 /*file_offset*/, 0, NULL, 
NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "writeBufferFromFile() failed"); + } else { + for (int i = 0; i < NumStages; ++i) { + error_ = writeBufferFromFile( + cmdQueues_[_deviceId], buffers_[i], CL_FALSE, 0 /*buffer_offset*/, + iterSize, amdFile_ /*file*/, iterSize * i /*file_offset*/, 0, NULL, + NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "writeBufferFromFile() failed"); + + error_ = _wrapper->clEnqueueCopyBuffer( + cmdQueues_[_deviceId], buffers_[i], buffers_[NumStages], 0, + iterSize * i, iterSize, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CopyBuffer() failed"); + _wrapper->clFlush(cmdQueues_[_deviceId]); + } + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + timer.Stop(); + double cur = timer.GetElapsedTime(); + if (i == 0) { + sec = cur; + } else { + sec = std::min(cur, sec); + } + } + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], + buffers_[finalBuf], CL_TRUE, 0, + BufferSize, buffer, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "Validation failed!"); + + for (int c = 0; c < NumChunks; ++c) { + for (cl_uint i = 0; i < ChunkSize; ++i) { + if (buffer[c * ChunkSize + i] != i) { + CHECK_RESULT(false, "Validation failed!"); + } + } + } + delete[] buffer; + + static const char* MemTypeStr[] = {"Visible ", "Remote ", "Invisible", + "Staging"}; + _perfInfo = (float)BufferSize / ((float)sec * 1024.f * 1024.f); + std::stringstream str; + str << "WriteBufferFromFile performance ("; + str << BufferSize / (1024 * 1024); + str << " MB of " << MemTypeStr[testID_ % 4] << ") transfer speed (MB/s):"; + testDescString = str.str(); +#endif +} + +unsigned int OCLLiquidFlash::close(void) { +#ifdef CL_VERSION_2_0 + if (!failed_) { + if (amdFile_ != NULL) { + releaseFile(amdFile_); + } + if (remove(BinFileName) != 0) { + } + } + return OCLTestImp::close(); +#else + return CL_SUCCESS; +#endif +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h 
b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h new file mode 100644 index 0000000000..a44d4ffdb2 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLLiquidFlash.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_LIQUID_FLASH_H_ +#define _OCL_LIQUID_FLASH_H_ + +#include "OCLTestImp.h" + +class OCLLiquidFlash : public OCLTestImp { + public: + OCLLiquidFlash(); + virtual ~OCLLiquidFlash(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int testID_; + cl_ulong maxSize_; +#ifdef CL_VERSION_2_0 + cl_file_amd amdFile_; +#endif + bool direct_; + size_t BufferSize; + int NumChunks; + int NumIter; + int NumStages; +#ifdef CL_VERSION_2_0 + clCreateSsgFileObjectAMD_fn createFile; + clRetainSsgFileObjectAMD_fn retainFile; + clReleaseSsgFileObjectAMD_fn releaseFile; + clEnqueueReadSsgFileAMD_fn writeBufferFromFile; +#endif +}; + +#endif // _OCL_LIQUID_FLASH_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp new file mode 100644 index 0000000000..5746f19b88 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLMapCount.h" + +#include +#include +#include + +#include "CL/cl.h" + +OCLMapCount::OCLMapCount() { _numSubTests = 1; } + +OCLMapCount::~OCLMapCount() {} + +void OCLMapCount::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + size_t size; + clMemWrapper memObject; + + // Get the address alignment, so we can make sure the sub buffer test later + // works properly + cl_uint addressAlign; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(addressAlign), &addressAlign, NULL); + if (addressAlign < 128) addressAlign = 128; + + void* void_buffer = malloc(addressAlign * 4); + + // Create a buffer to test against + memObject = _wrapper->clCreateBuffer(context_, + CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, + addressAlign * 4, void_buffer, &error_); + if (error_) { + free(void_buffer); + printf("Unable to create buffer to test"); + } + + // Map buffer + void* mapped = _wrapper->clEnqueueMapBuffer( + cmdQueues_[deviceId], memObject, true, CL_MAP_READ, 0, addressAlign * 4, + 0, NULL, NULL, &error_); + + cl_uint mapCount; + + // Find the number of mappings on buffer after map + error_ = _wrapper->clGetMemObjectInfo(memObject, CL_MEM_MAP_COUNT, + sizeof(mapCount), &mapCount, &size); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to get mem object map count"); + if (mapCount != 1) { + printf( + "ERROR: Returned mem object map count does not validate! 
(expected %d, " + "got %d)\n", + 1, mapCount); + return; + } + + // Unmap buffer + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[deviceId], memObject, + mapped, 0, NULL, NULL); + + // Find the number of mappings on buffer after unmap + error_ = _wrapper->clGetMemObjectInfo(memObject, CL_MEM_MAP_COUNT, + sizeof(mapCount), &mapCount, &size); + CHECK_RESULT((error_ != CL_SUCCESS), "Unable to get mem object map count"); + if (mapCount != 0) { + printf( + "ERROR: Returned mem object map count does not validate! (expected %d, " + "got %d)\n", + 0, mapCount); + return; + } +} + +void OCLMapCount::run(void) {} + +unsigned int OCLMapCount::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h new file mode 100644 index 0000000000..7f3f09e7a8 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMapCount.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_MAP_COUNT_H_ +#define _OCL_MAP_COUNT_H_ + +#include "OCLTestImp.h" + +class OCLMapCount : public OCLTestImp { + public: + OCLMapCount(); + virtual ~OCLMapCount(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); +}; + +#endif // _OCL_MAP_COUNT_H_ + +class clMemWrapper { + public: + clMemWrapper() { mMem = NULL; } + clMemWrapper(cl_mem mem) { mMem = mem; } + ~clMemWrapper() { + if (mMem != NULL) clReleaseMemObject(mMem); + } + + clMemWrapper& operator=(const cl_mem& rhs) { + mMem = rhs; + return *this; + } + operator cl_mem() { return mMem; } + + cl_mem* operator&() { return &mMem; } + + bool operator==(const cl_mem& rhs) { return mMem == rhs; } + + protected: + cl_mem mMem; +}; diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp new file mode 100644 index 0000000000..27d6eec6f4 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.cpp @@ -0,0 +1,153 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLMemDependency.h" + +#include +#include +#include + +#include "CL/cl.h" + +const static cl_uint Stages = 4; +const static cl_uint ThreadsForCheck = 1 << Stages; + +#define KERNEL_CODE(...) #__VA_ARGS__ + +const static char* strKernel = KERNEL_CODE( +\n __kernel void bitonicSort(__global uint2* keys, uint stage, uint pass) { + const uint thread = get_global_id(0); + + const uint pairDistance = 1 << (stage - pass); + + /* The purpose of this is to introduce an additional zero at stage - pass + * bit*/ + const uint leftID = + (thread & (pairDistance - 1)) | + ((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */ + + const uint direction = ((thread >> stage) & 1) == 1 ? 0 : 1; + + const uint rightID = leftID + pairDistance; + const uint2 left = keys[leftID]; + const uint2 right = keys[rightID]; + + const uint2 larger = left.x > right.x ? left : right; + const uint2 smaller = left.x > right.x ? 
right : left; + + keys[leftID] = direction ? smaller : larger; + keys[rightID] = direction ? larger : smaller; +} +\n); + +OCLMemDependency::OCLMemDependency() { _numSubTests = 1; } + +OCLMemDependency::~OCLMemDependency() {} + +void OCLMemDependency::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + char dbuffer[1024] = {0}; + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "bitonicSort", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + ThreadsForCheck * sizeof(cl_uint2), NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + cl_buffer_region reg = {0, ThreadsForCheck * sizeof(cl_uint2)}; + buffer = + _wrapper->clCreateSubBuffer(buffers()[0], CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLMemDependency::run(void) { + cl_uint2 values[ThreadsForCheck] = { + {{3, 0}}, {{1, 5}}, {{4, 6}}, {{2, 4}}, {{0, 3}}, {{5, 10}}, + {{15, 7}}, {{13, 8}}, {{10, 2}}, {{9, 
1}}, {{7, 11}}, {{11, 9}}, + {{14, 12}}, {{12, 14}}, {{6, 13}}, {{8, 15}}}; + cl_uint2 reference[ThreadsForCheck] = { + {{0, 3}}, {{1, 5}}, {{3, 0}}, {{2, 4}}, {{4, 6}}, {{5, 10}}, + {{6, 13}}, {{8, 15}}, {{7, 11}}, {{9, 1}}, {{10, 2}}, {{11, 9}}, + {{14, 12}}, {{12, 14}}, {{15, 7}}, {{13, 8}}}; + cl_uint2 results[ThreadsForCheck]; + + cl_mem buffer = buffers()[0]; + error_ = + _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, true, 0, + sizeof(values), values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + size_t gws[1] = {ThreadsForCheck}; + + for (unsigned int i = 0; i < Stages; ++i) { + buffer = buffers()[i % 2]; + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + for (unsigned int j = 0; j < i; ++j) { + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &i); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), &j); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel( + cmdQueues_[_deviceId], kernel_, 1, NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + } + + buffer = buffers()[0]; + error_ = + _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, true, 0, + sizeof(results), results, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + for (unsigned int i = 0; i < ThreadsForCheck; ++i) { + if ((results[i].s[0] != reference[i].s[0]) || + (results[i].s[1] != reference[i].s[1])) { + CHECK_RESULT(true, "Incorrect result for dependency!\n"); + } + } +} + +unsigned int OCLMemDependency::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h 
b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h new file mode 100644 index 0000000000..2308ae25b8 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemDependency.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_MEM_DEPENDENCY_H_ +#define _OCL_MEM_DEPENDENCY_H_ + +#include "OCLTestImp.h" + +/* Test fixture derived from OCLTestImp; it overrides the open/run/close entry points. */ +class OCLMemDependency : public OCLTestImp { + public: + OCLMemDependency(); + virtual ~OCLMemDependency(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); +}; + +#endif // _OCL_MEM_DEPENDENCY_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp new file mode 100644 index 0000000000..8e4b3122ad --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.cpp @@ -0,0 +1,139 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
*/ + +#include "OCLMemObjs.h" + +#include +#include +#include + +#include +#include +#include +#include + +const char* OCLMemObjs::kernel_src = ""; + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +OCLMemObjs::OCLMemObjs() { _numSubTests = 1; } + +OCLMemObjs::~OCLMemObjs() {} + +void OCLMemObjs::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; +} + +int OCLMemObjs::test(void) { + cl_int err; + + std::vector platforms; + cl::Platform::get(&platforms); + if (platforms.empty()) { + std::cerr << "Platform::get() failed \n"; + return EXIT_FAILURE; + } + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; + cl::Context context(CL_DEVICE_TYPE_ALL, properties, NULL, NULL, &err); + if (err != CL_SUCCESS) { + std::cerr << "Context::Context() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + + std::vector devices = context.getInfo(); + if (err != CL_SUCCESS) { + std::cerr << "Context::getInfo() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + if (devices.size() == 0) { + std::cerr << "No device available\n"; + return EXIT_FAILURE; + } + + const char source[] = "__kernel void test_memobjs(__global int* ptr) {}"; + cl::Program::Sources sources(1, std::make_pair(source, 0)); + + cl::Program program(context, sources, &err); + if (err != CL_SUCCESS) { + std::cerr << "Program::Program() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + err = program.build(devices); + if (err != CL_SUCCESS) { + std::cerr << "Program::build() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + + cl::Kernel kernel(program, "test_memobjs", &err); + if (err != CL_SUCCESS) { + std::cerr << "Kernel::Kernel() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + if (err != CL_SUCCESS) { + std::cerr << "Kernel::setArg() failed (" << err << ")\n"; + return 
EXIT_FAILURE; + } + + cl::CommandQueue queue(context, devices[0], 0, &err); + if (err != CL_SUCCESS) { + std::cerr << "CommandQueue::CommandQueue() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + + cl::Buffer buffer(context, (cl_mem_flags)0, 1024, NULL, &err); + if (err != CL_SUCCESS) { + std::cerr << "Buffer::Buffer() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + + err = kernel.setArg(0, buffer); + if (err != CL_SUCCESS) { + std::cerr << "Kernel::setArg() failed (" << err << ")\n"; + return EXIT_FAILURE; + } + + err = queue.enqueueTask(kernel); + if (err != CL_SUCCESS) { + std::cerr << "CommandQueue::enqueueTask() failed (" << err << ")\n"; + } + + // Force a clReleaseMemoryObject on buffer before dispatch. + buffer = cl::Buffer(); + + err = queue.finish(); + if (err != CL_SUCCESS) { + std::cerr << "CommandQueue::finish() failed (" << err << ")\n"; + } + + // std::cout << " Test: Pass!\n"; + return EXIT_SUCCESS; +} + +void OCLMemObjs::run(void) { + CHECK_RESULT((test() != EXIT_SUCCESS), "test failed"); +} + +unsigned int OCLMemObjs::close(void) { return _crcword; } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h new file mode 100644 index 0000000000..c3a414eb4b --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemObjs.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_Mem_Objs_H_ +#define _OCL_Mem_Objs_H_ + +#include "CL/cl.h" +#include "OCLTestImp.h" + +/* Test fixture derived from OCLTestImp; besides the open/run/close overrides it declares test(), an int-returning helper, and the static kernel_src string. */ +class OCLMemObjs : public OCLTestImp { + public: + OCLMemObjs(); + virtual ~OCLMemObjs(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + int test(void); + + static const char* kernel_src; + + private: + cl_int error; +}; + +#endif // _OCL_Mem_Objs_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp new file mode 100644 index 0000000000..bbd3fdc085 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.cpp @@ -0,0 +1,200 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLMemoryInfo.h" + +#include +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +OCLMemoryInfo::OCLMemoryInfo() { + // Run the second test with 64 bit only + _numSubTests = (sizeof(int*) == 8) ? 
2 : 1; + failed_ = false; +} + +OCLMemoryInfo::~OCLMemoryInfo() {} + +void OCLMemoryInfo::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + _deviceId = deviceId; + test_ = test; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + + char name[1024] = {0}; + size_t size = 0; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024, + name, &size); + if (!strstr(name, "cl_amd_device_attribute_query")) { + printf("AMD device attribute extension is required for this test!\n"); + failed_ = true; + return; + } + // Observed failures with APUs on GSL path due to incorrect available memory, + // reported for visible heap + cl_bool is_apu = false; + error_ = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &is_apu, nullptr); + if (is_apu && (test == 1)) { + printf("Test not supported for apus, skipping...\n"); + failed_ = true; + return; + } +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLMemoryInfo::run(void) { + if (failed_) { + return; + } + + size_t BufSize = 0x1000000; + bool succeed = false; + bool done = false; + if (test_ == 0) { + // use multiple loops to make sure the failure case is not caused + // by reusing the allocation from the cached memory pool + for (int i = 0; i < 5 && !done; i++) { + cl_mem buffer; + size_t memoryInfo[2]; + _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, + 2 * sizeof(size_t), memoryInfo, NULL); + + buffer = + 
_wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + BufSize * sizeof(cl_int4), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + + unsigned int* values; + values = reinterpret_cast(new cl_int4[BufSize]); + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + error_ = _wrapper->clEnqueueWriteBuffer( + cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + size_t memoryInfo2[2]; + _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, + 2 * sizeof(size_t), memoryInfo2, NULL); + + size_t dif = memoryInfo[0] - memoryInfo2[0]; + if (dif == 0) { // the buffer memory may come from the cached memory pool + BufSize *= 2; // double the size and try again + } else if ((dif >= + (static_cast(BufSize * sizeof(cl_int4) * 1.5f) / + 1024)) || + (dif <= ((BufSize * sizeof(cl_int4) / 2) / 1024))) { + done = true; + } else { + succeed = true; + done = true; + } + + delete[] values; + } + } else { + int i = 0; + size_t sizeAll; + size_t memoryInfo[2]; + _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, + 2 * sizeof(size_t), memoryInfo, NULL); + unsigned int* values; + values = reinterpret_cast(new cl_int4[BufSize]); + memset(values, 0, BufSize * sizeof(cl_int4)); + // Loop a few times to make sure the results are consistent + for (int k = 0; k < 3; ++k) { + sizeAll = 0; + while (true) { + cl_mem buffer; + + buffer = + _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + BufSize * sizeof(cl_int4), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + + // Clear destination buffer + error_ = _wrapper->clEnqueueWriteBuffer( + cmdQueues_[_deviceId], buffer, CL_TRUE, 0, + BufSize * sizeof(cl_int4), values, 0, NULL, NULL); + CHECK_RESULT((error_ != 
CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + sizeAll += BufSize * sizeof(cl_int4) / 1024; + size_t memoryInfo2[2]; + _wrapper->clGetDeviceInfo(devices_[_deviceId], + CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, + 2 * sizeof(size_t), memoryInfo2, NULL); + if (memoryInfo2[0] < (0x50000 + (BufSize * sizeof(cl_int4) / 1024))) { + break; + } + size_t dif = memoryInfo[0] - memoryInfo2[0]; + // extra memory could be allocated/destroyed in the driver + if ((dif / sizeAll) == 1 || (sizeAll / dif) == 1) { + succeed = true; + } else { + succeed = false; + break; + } + ++i; + } + for (auto& it : buffers()) { + error_ = _wrapper->clReleaseMemObject(it); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), + "clReleaseMemObject() failed"); + } + buffers_.clear(); + if (!succeed) { + break; + } + } + delete[] values; + } + + if (!succeed) { + CHECK_RESULT(true, "Reported free memory doesn't match allocated size!"); + } +} + +unsigned int OCLMemoryInfo::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h new file mode 100644 index 0000000000..8c36d53709 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMemoryInfo.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_MEMORY_INFO_H_ +#define _OCL_MEMORY_INFO_H_ + +#include "OCLTestImp.h" + +/* Test fixture derived from OCLTestImp; it overrides open/run/close and keeps two private fields, failed_ and test_. */ +class OCLMemoryInfo : public OCLTestImp { + public: + OCLMemoryInfo(); + virtual ~OCLMemoryInfo(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + uint32_t test_; +}; + +#endif // _OCL_MEMORY_INFO_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp new file mode 100644 index 0000000000..743cd45815 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.cpp @@ -0,0 +1,295 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLMultiQueue.h" + +#include +#include +#include + +#include +#include + +#include "CL/cl.h" + +const static char* strKernel = + "__kernel void \n" + "copyInc(__global uint* dst, __global uint* src) \n" + "{ \n" + " uint index = get_global_id(0); \n" + " \n" + " dst[index] = src[index] + 1; \n" + "} \n"; + +static bool useGPU = true; + +static const cl_uint NumQueues = 8; // must be power of 2 +static cl_uint NumElements = 4096; +static const cl_uint NumRuns = 16384; +static const cl_uint ExecutionsPerQueue = 256; +std::stringstream lerror; + +class MemTransfer { + public: + MemTransfer(OCLWrapper* wrapper, cl_context context, cl_command_queue queue, + cl_uint numElements) + : wrapper_(wrapper), + context_(context), + queue_(queue), + numElements_(numElements), + count_(0) {} + + ~MemTransfer() { + wrapper_->clReleaseMemObject(dst_); + wrapper_->clReleaseMemObject(src_); + } + + bool create() { + cl_int err; + size_t size = numElements_ * sizeof(cl_uint); + cl_uint* data = new cl_uint[numElements_]; + memset(data, 0, size); + + src_ = wrapper_->clCreateBuffer(context_, CL_MEM_COPY_HOST_PTR, size, data, + &err); + if (src_ == NULL) { + lerror << "clReleaseContext failed"; + delete[] data; + return false; + } + dst_ = wrapper_->clCreateBuffer(context_, 0, size, NULL, &err); + if (dst_ == NULL) { + lerror << "clCreateBuffer() failed"; + delete[] data; + return false; + } + + delete[] data; + return true; + } + + bool run(cl_kernel kernel) { + size_t global_work_size[1]; + size_t 
local_work_size[1]; + size_t size = numElements_ * sizeof(cl_uint); + + global_work_size[0] = (numElements_ + 63) / 64 * 64; + local_work_size[0] = 64; + + if (CL_SUCCESS != + wrapper_->clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&dst_)) { + return false; + } + + if (CL_SUCCESS != + wrapper_->clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&src_)) { + return false; + } + + if (CL_SUCCESS != wrapper_->clEnqueueNDRangeKernel( + queue_, kernel, 1, NULL, + (const size_t*)global_work_size, + (const size_t*)local_work_size, 0, NULL, NULL)) { + lerror << "clEnqueueNDRangeKernel() failed"; + return false; + } + + // Copy dst into src + if (CL_SUCCESS != wrapper_->clEnqueueCopyBuffer(queue_, dst_, src_, 0, 0, + size, 0, 0, NULL)) { + lerror << "clEnqueueCopyBuffer() failed"; + return false; + } + count_++; + return true; + } + + bool check() { + size_t size = numElements_ * sizeof(cl_uint); + cl_event event; + void* ptr = wrapper_->clEnqueueMapBuffer(queue_, src_, CL_TRUE, CL_MAP_READ, + 0, size, 0, NULL, NULL, NULL); + cl_uint* data = reinterpret_cast(ptr); + + for (cl_uint i = 0; i < numElements_; ++i) { + if (data[i] != count_) { + return false; + } + } + wrapper_->clEnqueueUnmapMemObject(queue_, src_, ptr, 0, NULL, &event); + wrapper_->clWaitForEvents(1, &event); + wrapper_->clReleaseEvent(event); + return true; + } + + void flush() { wrapper_->clFlush(queue_); } + + private: + OCLWrapper* wrapper_; + cl_context context_; + cl_command_queue queue_; + cl_uint numElements_; + cl_uint count_; + cl_mem dst_; + cl_mem src_; +}; + +MemTransfer* work[NumQueues]; + +bool test(cl_kernel, cl_uint, cl_uint); + +OCLMultiQueue::OCLMultiQueue() { + _numSubTests = 0; + for (cl_uint i = 1; i <= NumQueues; i <<= 1, _numSubTests++) + ; + failed_ = false; +} + +OCLMultiQueue::~OCLMultiQueue() {} + +void OCLMultiQueue::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != 
CL_SUCCESS), "Error opening test"); + test_ = test; + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + testDescString = "GPU device is required for this test!\n"; + failed_ = true; + return; + } + size_t maxWorkGroupSize = 1; + cl_uint computePower = 1; + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); + computePower *= static_cast(maxWorkGroupSize); + cl_uint maxComputeUnits = 1; + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), + &maxComputeUnits, NULL); + computePower *= 32 * maxComputeUnits; + NumElements = (NumElements < static_cast(computePower)) + ? static_cast(computePower) + : NumElements; + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + kernel_ = _wrapper->clCreateKernel(program_, "copyInc", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); +} + +void OCLMultiQueue::run(void) { + if (failed_) { + return; + } + + // Run test + cl_uint queues = 1 << test_; + if (!test(kernel_, NumRuns / queues, queues)) { + lerror << "We failed a test run!"; + CHECK_RESULT(true, lerror.str().c_str()); + } +} + +unsigned int OCLMultiQueue::close(void) { return OCLTestImp::close(); } + +bool 
OCLMultiQueue::test(cl_kernel kernel, cl_uint numRuns, cl_uint numQueues) { + cl_command_queue cmd_queue[NumQueues]; + CPerfCounter timer; + + for (cl_uint i = 0; i < numQueues; ++i) { + cmd_queue[i] = _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], + 0, &error_); + if (cmd_queue[i] == (cl_command_queue)0) { + _wrapper->clReleaseContext(context_); + testDescString = "clCreateCommandQueue() failed"; + return false; + } + work[i] = new MemTransfer(_wrapper, context_, cmd_queue[i], NumElements); + if (work[i] == NULL || !work[i]->create()) { + testDescString = "Test creation failed"; + return false; + } + } + + timer.Reset(); + timer.Start(); + + cl_uint dispatchCount = ExecutionsPerQueue / numQueues; + for (cl_uint i = 0; i < numRuns; ++i) { + for (cl_uint j = 0; j < numQueues; ++j) { + if (!work[j]->run(kernel)) { + testDescString = "Execution failed"; + return false; + } + // Every queue should have a dispatch after 256 executions, + // but the time for dispatch on each queue + // will be shifted on dispatchCount + if (((i % dispatchCount) == 0) && + (((i / dispatchCount) % numQueues) == j)) { + work[j]->flush(); + } + } + } + + for (cl_uint i = 0; i < numQueues; ++i) { + _wrapper->clFinish(cmd_queue[i]); + } + + timer.Stop(); + + for (cl_uint j = 0; j < numQueues; ++j) { + if (!work[j]->check()) { + testDescString = "Result Check fails!"; + return false; + } + } + std::stringstream stream; + + stream << "Num Queues: " << numQueues << ", Executions Per Queue: "; + stream.flags(std::ios::right | std::ios::showbase); + stream.width(5); + stream << numRuns; + stream.precision(3); + stream << ", Time: " << (float)(timer.GetElapsedTime()) << " seconds"; + + for (cl_uint i = 0; i < numQueues; ++i) { + delete work[i]; + _wrapper->clReleaseCommandQueue(cmd_queue[i]); + } + testDescString = stream.str(); + + return true; +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h 
b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h new file mode 100644 index 0000000000..8b27b878a3 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLMultiQueue.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_MULTI_QUEUE_H_ +#define _OCL_MULTI_QUEUE_H_ + +#include "OCLTestImp.h" + +/* Test fixture derived from OCLTestImp; it overrides open/run/close and adds a private bool-returning test() helper plus the failed_ and test_ fields. */ +class OCLMultiQueue : public OCLTestImp { + public: + OCLMultiQueue(); + virtual ~OCLMultiQueue(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool test(cl_kernel kernel, cl_uint numRuns, cl_uint numQueues); + bool failed_; + unsigned int test_; +}; + +#endif // _OCL_MULTI_QUEUE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp new file mode 100644 index 0000000000..44317a3610 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.cpp @@ -0,0 +1,206 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
*/ + +#include "OCLOfflineCompilation.h" + +#include +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" +#include "cl_kernel_info_amd.h" + +typedef CL_API_ENTRY cl_int(CL_API_CALL* clGetKernelInfoAMD_fn)( + cl_kernel kernel, cl_device_id device, cl_kernel_info_amd param_name, + size_t param_value_size, void* param_value, size_t* param_value_size_ret); + +clGetKernelInfoAMD_fn clGetKernelInfoAMDp; + +#define BLIT_KERNEL(...) #__VA_ARGS__ + +const char* strKernel12 = BLIT_KERNEL( +\n const constant uint test = 1; __kernel void factorial(__global uint* out) { + uint id = get_global_id(0); + uint factorial = 1; + out[id] = factorial + test; +} +\n); + +const char* strKernel20 = BLIT_KERNEL( +\n const constant uint test = 1; global uint test2 = 0; + __kernel void factorial(__global uint* out) { + uint id = get_global_id(0); + uint factorial = 1; + out[id] = factorial + test; + if (id == 0) { + out[id] += test2++; + } + } +\n); + +OCLOfflineCompilation::OCLOfflineCompilation() { _numSubTests = 1; } + +OCLOfflineCompilation::~OCLOfflineCompilation() {} + +void OCLOfflineCompilation::open(unsigned int test, char* units, + double& conversion, unsigned int deviceId) { + size_t nDevices = 0; + cl_device_id* devices = NULL; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + _wrapper->clReleaseContext(context_); + + cl_context_properties cprops[5]; + clGetKernelInfoAMDp = + (clGetKernelInfoAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clGetKernelInfoAMD"); + if (clGetKernelInfoAMDp == NULL) { + testDescString = "clGetKernelInfoAMD not found!\n"; + return; + } + + // Utilize the CL_CONTEXT_OFFLINE_DEVICES_AMD platform option to allow for + // the generation of binary kernel without target device installed in build + // system. 
+ cprops[0] = CL_CONTEXT_PLATFORM; + cprops[1] = (cl_context_properties)platform_; + cprops[2] = CL_CONTEXT_OFFLINE_DEVICES_AMD; + cprops[3] = (cl_context_properties)1; + cprops[4] = (cl_context_properties)0; // end of options list marker + + // Create a context with all of the available devices. + context_ = _wrapper->clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, NULL, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContextFromType() failed"); + + size_t deviceListSize = 0; + error_ = _wrapper->clGetContextInfo(context_, CL_CONTEXT_NUM_DEVICES, + sizeof(size_t), &deviceListSize, NULL); + CHECK_RESULT(((error_ != CL_SUCCESS) || (deviceListSize == 0)), + "clGetContextInfo() failed"); + + devices = (cl_device_id*)malloc(sizeof(cl_device_id) * deviceListSize); + CHECK_RESULT((devices == NULL), "clGetContextInfo() failed"); + + memset(devices, 0, deviceListSize); + + error_ = _wrapper->clGetContextInfo(context_, CL_CONTEXT_DEVICES, + sizeof(cl_device_id) * deviceListSize, + devices, &nDevices); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetContextInfo() failed"); + + for (unsigned version = 1; version <= 2; ++version) { + std::string options; + const char* strKernel; + + switch (version) { + case 1: + options = ""; + strKernel = strKernel12; + break; + case 2: + options = "-cl-std=CL2.0"; + strKernel = strKernel20; + break; + default: + assert(false); + return; + } + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + for (unsigned int i = 0; i < deviceListSize; ++i) { + char name[128]; + char strVersion[128]; + _wrapper->clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name), name, + NULL); + error_ = _wrapper->clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, + sizeof(strVersion), strVersion, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + if (version == 2 && strVersion[7] < '2') { + continue; + } + + 
// skipping the test on gfx9+ for now till we add compiler support for al + // the gfx10+ subdevices + cl_uint gfxip_major = 0; + cl_uint gfxip_minor = 0; + clGetDeviceInfo(devices[i], CL_DEVICE_GFXIP_MAJOR_AMD, + sizeof(gfxip_major), &gfxip_major, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_GFXIP_MINOR_AMD, + sizeof(gfxip_minor), &gfxip_minor, NULL); + + printf("Building on %s, OpenCL version %s, (options '%s')\n", name, + (version == 2 ? "2.0" : "1.2"), options.c_str()); + error_ = _wrapper->clBuildProgram(program_, 1, &devices[i], + options.c_str(), NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo( + program_, devices[i], CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + break; + } + kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + size_t usedVGPRs = 0; + error_ = + clGetKernelInfoAMDp(kernel_, devices[i], CL_KERNELINFO_USED_VGPRS, + sizeof(usedVGPRs), &usedVGPRs, NULL); + CHECK_RESULT(((error_ != CL_SUCCESS) || (usedVGPRs == 0)), + "clGetKernelInfoAMD() failed"); + + _wrapper->clReleaseKernel(kernel_); + kernel_ = nullptr; + + size_t binSize; + error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t), &binSize, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo() failed"); + char* binary = new char[binSize]; + error_ = _wrapper->clGetProgramInfo(program_, CL_PROGRAM_BINARIES, + sizeof(char*), &binary, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetProgramInfo() failed"); + delete[] binary; + } + if (version == 1) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT((error_ != CL_SUCCESS), "clReleaseProgram() failed"); + } + } + free(devices); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLOfflineCompilation::run(void) {} 
+
+// Tears the test down by delegating to the shared OCLTestImp::close() and
+// returning whatever it reports.
+unsigned int OCLOfflineCompilation::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h
new file mode 100644
index 0000000000..ec8c438309
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLOfflineCompilation.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_OFFLINE_COMPILATION_H_
+#define _OCL_OFFLINE_COMPILATION_H_
+
+#include "OCLTestImp.h"
+
+// Runtime test that builds OpenCL programs against a context created with
+// CL_CONTEXT_OFFLINE_DEVICES_AMD. The compilation work is done in open();
+// run() has an empty body.
+class OCLOfflineCompilation : public OCLTestImp {
+ public:
+  OCLOfflineCompilation();
+  virtual ~OCLOfflineCompilation();
+
+ public:
+  // Creates the offline-device context and builds the test kernel for each
+  // device it reports.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // No-op for this test.
+  virtual void run(void);
+  // Forwards to OCLTestImp::close() and returns its result.
+  virtual unsigned int close(void);
+};
+
+#endif  // _OCL_OFFLINE_COMPILATION_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp
new file mode 100644
index 0000000000..1cc9127b98
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.cpp
@@ -0,0 +1,286 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#include "OCLP2PBuffer.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "CL/cl.h" + +const static size_t ChunkSize = 256 * 1024; +const static int NumSizes = 5; +const static int NumRuns = 4; +const static int NumChunksArray[NumSizes] = {1, 4, 16, 32, 64}; +const static size_t MaxSubTests = NumRuns * NumSizes; +const static int NumIterArray[NumSizes] = {20, 15, 10, 10, 10}; + +OCLP2PBuffer::OCLP2PBuffer() { +#ifdef CL_VERSION_2_0 + _numSubTests = MaxSubTests; +#else + _numSubTests = 0; +#endif + failed_ = false; + maxSize_ = 0; + context0_ = nullptr; + context1_ = nullptr; + cmdQueue0_ = nullptr; + cmdQueue1_ = nullptr; +} + +OCLP2PBuffer::~OCLP2PBuffer() {} + +void OCLP2PBuffer::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { +#ifdef CL_VERSION_2_0 + cl_uint numPlatforms = 0; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + if (deviceCount_ < 2) { + printf("\nTwo GPUs are required to run P2P test\n"); + failed_ = true; + return; + } + + testID_ = test; + char name[1024] = {0}; + size_t size = 0; + _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_EXTENSIONS, 1024, name, + &size); + if (!strstr(name, "cl_amd_copy_buffer_p2p")) { + printf("P2P extension is required for this test!\n"); + failed_ = true; + return; + } + + _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_EXTENSIONS, 1024, name, + &size); + if (!strstr(name, "cl_amd_copy_buffer_p2p")) { + printf("P2P extension is required for this test!\n"); + failed_ = true; + return; + } + num_p2p_0_ = 0; + _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_NUM_P2P_DEVICES_AMD, + sizeof(num_p2p_0_), &num_p2p_0_, nullptr); + if (num_p2p_0_ != 0) { + cl_device_id* p2p = new cl_device_id[num_p2p_0_]; + _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_P2P_DEVICES_AMD, + sizeof(cl_device_id) * num_p2p_0_, p2p, 
nullptr); + delete[] p2p; + } + num_p2p_1_ = 0; + _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_NUM_P2P_DEVICES_AMD, + sizeof(num_p2p_1_), &num_p2p_1_, nullptr); + if (num_p2p_1_ != 0) { + cl_device_id* p2p = new cl_device_id[num_p2p_1_]; + _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_P2P_DEVICES_AMD, + sizeof(cl_device_id) * num_p2p_1_, p2p, nullptr); + delete[] p2p; + } + + cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform, 0}; + context0_ = + _wrapper->clCreateContext(props, 1, &devices_[0], NULL, 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext#0 failed"); + + context1_ = + _wrapper->clCreateContext(props, 1, &devices_[1], NULL, 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext#1 failed"); + + NumChunks = NumChunksArray[testID_ % NumSizes]; + NumIter = NumIterArray[testID_ % NumSizes]; + BufferSize = NumChunks * ChunkSize * sizeof(cl_uint); + + p2p_copy_ = + (clEnqueueCopyBufferP2PAMD_fn)clGetExtensionFunctionAddressForPlatform( + platform_, "clEnqueueCopyBufferP2PAMD"); + if (p2p_copy_ == NULL) { + testDescString = "Failed to initialize P2P extension!\n"; + failed_ = true; + return; + } + + cl_queue_properties prop[] = {CL_QUEUE_PROPERTIES, 0, 0}; + cmdQueue0_ = _wrapper->clCreateCommandQueueWithProperties( + context0_, devices_[0], prop, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); + cmdQueue1_ = _wrapper->clCreateCommandQueueWithProperties( + context1_, devices_[1], prop, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); + + size_t chunkSize = ChunkSize; + + cl_mem buf = NULL; + cl_uint memFlags = 0; + buf = _wrapper->clCreateBuffer(context0_, CL_MEM_READ_ONLY | memFlags, + BufferSize, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buf); + + buf = + _wrapper->clCreateBuffer(context1_, memFlags, BufferSize, NULL, 
&error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buf); +#endif +} + +void OCLP2PBuffer::run(void) { +#ifdef CL_VERSION_2_0 + if (failed_) { + return; + } + size_t finalBuf = 0; + cl_uint subTest = (testID_ / NumSizes) % 2; + + cl_uint* buffer = new cl_uint[NumChunks * ChunkSize]; + cl_uint* buffer2 = new cl_uint[NumChunks * ChunkSize]; + cl_event event; + + memset(buffer, 0x23, BufferSize); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueue1_, buffers_[1], CL_TRUE, 0, + BufferSize, buffer, 0, nullptr, + (subTest == 0) ? &event : nullptr); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + memset(buffer2, 0xEB, BufferSize); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueue0_, buffers_[0], CL_TRUE, 0, + BufferSize, buffer2, 0, nullptr, + (subTest == 1) ? &event : nullptr); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + CPerfCounter timer; + + double sec = 0.; + if (subTest == 0) { + error_ = p2p_copy_(cmdQueue0_, buffers_[0], buffers_[1], 0, 0, BufferSize, + 1, &event, nullptr); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueCopyBufferP2PAMD() failed"); + _wrapper->clFinish(cmdQueue0_); + } else { + error_ = p2p_copy_(cmdQueue1_, buffers_[1], buffers_[0], 0, 0, BufferSize, + 1, &event, nullptr); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueCopyBufferP2PAMD() failed"); + _wrapper->clFinish(cmdQueue1_); + } + clReleaseEvent(event); + cl_command_queue execQueue; + if (((testID_ / NumSizes) == 0) || ((testID_ / NumSizes) == 3)) { + execQueue = cmdQueue0_; + } else { + execQueue = cmdQueue1_; + } + + for (int i = 0; i < NumIter; ++i) { + timer.Reset(); + timer.Start(); + + if (subTest == 0) { + p2p_copy_(execQueue, buffers_[0], buffers_[1], 0, 0, BufferSize, 0, + nullptr, nullptr); + } else { + p2p_copy_(execQueue, buffers_[1], buffers_[0], 0, 0, BufferSize, 0, + nullptr, nullptr); + } + _wrapper->clFinish(execQueue); + timer.Stop(); + double cur = 
timer.GetElapsedTime(); + if (i == 0) { + sec = cur; + } else { + sec = std::min(cur, sec); + } + } + memset(buffer, 0x20, BufferSize); + if (subTest == 0) { + error_ = _wrapper->clEnqueueReadBuffer(cmdQueue1_, buffers_[1], CL_TRUE, 0, + BufferSize, buffer, 0, NULL, NULL); + } else { + error_ = _wrapper->clEnqueueReadBuffer(cmdQueue0_, buffers_[0], CL_TRUE, 0, + BufferSize, buffer, 0, NULL, NULL); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed!"); + + cl_uint cmp_value = (subTest == 0) ? 0xEBEBEBEB : 0x23232323; + for (int c = 0; c < NumChunks; ++c) { + for (cl_uint i = 0; i < ChunkSize; ++i) { + if (buffer[c * ChunkSize + i] != cmp_value) { + CHECK_RESULT(true, "Validation failed!"); + } + } + } + delete[] buffer; + delete[] buffer2; + + cl_uint* p2p = ((subTest == 0) ? &num_p2p_0_ : &num_p2p_1_); + static const char* MemTypeStr[] = {"Visible ", "Remote ", "Invisible", + "Staging"}; + _perfInfo = (float)BufferSize / ((float)sec * 1000.f * 1000.f * 1000.f); + std::stringstream str; + if ((testID_ / (2 * NumSizes)) == 0) { + str << "Write dev" << ((subTest == 0) ? 0 : 1) << "->dev" + << ((subTest == 0) ? 1 : 0) << ((*p2p != 0) ? " " : " ") << "("; + } else { + str << "Read dev" << ((subTest == 0) ? 1 : 0) << "<-dev" + << ((subTest == 0) ? 0 : 1) << ((*p2p != 0) ? 
" " : " ") << "("; + } + str.width(2); + str << BufferSize / (1000 * 1000); + str << " MB " + << ") transfer speed (GB/s):"; + testDescString = str.str(); +#endif +} + +unsigned int OCLP2PBuffer::close(void) { +#ifdef CL_VERSION_2_0 + if (!failed_) { + if (cmdQueue0_ != nullptr) { + _wrapper->clReleaseCommandQueue(cmdQueue0_); + } + if (cmdQueue1_ != nullptr) { + _wrapper->clReleaseCommandQueue(cmdQueue1_); + } + if (context0_ != nullptr) { + _wrapper->clReleaseContext(context0_); + } + if (context1_ != nullptr) { + _wrapper->clReleaseContext(context1_); + } + } + return OCLTestImp::close(); +#else + return CL_SUCCESS; +#endif +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h new file mode 100644 index 0000000000..b6fc61a478 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLP2PBuffer.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_P2P_BUFFER_H_
+#define _OCL_P2P_BUFFER_H_
+
+#include "OCLTestImp.h"
+
+// Test that copies a buffer between two devices through the
+// clEnqueueCopyBufferP2PAMD extension entry point.
+class OCLP2PBuffer : public OCLTestImp {
+ public:
+  OCLP2PBuffer();
+  virtual ~OCLP2PBuffer();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  bool failed_;                 // set when a prerequisite for the test is missing
+  unsigned int testID_;         // sub-test index passed to open()
+  cl_ulong maxSize_;
+  size_t BufferSize;            // size in bytes of each device buffer
+  int NumChunks;                // chunks per transfer for this sub-test
+  int NumIter;                  // timed iterations for this sub-test
+  int NumStages;
+  cl_context context0_;         // context for the first device
+  cl_context context1_;         // context for the second device
+  cl_command_queue cmdQueue0_;  // queue on the first device
+  cl_command_queue cmdQueue1_;  // queue on the second device
+  cl_uint num_p2p_0_;           // P2P peer count reported by the first device
+  cl_uint num_p2p_1_;           // P2P peer count reported by the second device
+#ifdef CL_VERSION_2_0
+  // Extension entry point, resolved at runtime.
+  clEnqueueCopyBufferP2PAMD_fn p2p_copy_;
+#endif
+};
+
+#endif  // _OCL_P2P_BUFFER_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp
new file mode 100644
index 0000000000..1e897bafe1
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.cpp
@@ -0,0 +1,292 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLPartialWrkgrp.h" + +#include +#include +#include +#include + +#include "CL/cl.h" + +static const size_t BufSize = 0x1000; + +const static char* strKernel = + "__kernel void fillX(__global int4* out) \n" + "{ \n" + " int id = get_global_id(0); \n" + " out[id].x = id; \n" + "} \n" + " \n" + "__kernel void fillXY(__global int4* out) \n" + "{ \n" + " int id = get_global_id(0) + get_global_id(1) * get_global_size(0); \n" + " out[id].x = get_global_id(0); \n" + " out[id].y = get_global_id(1); \n" + "} \n" + " \n" + "__kernel void fillXYZ(__global int4* out) \n" + "{ \n" + " int id = get_global_id(0) + get_global_id(1) * get_global_size(0) + \n" + " get_global_id(2) * get_global_size(0) * get_global_size(1); \n" + " out[id].x = get_global_id(0); \n" + " out[id].y = get_global_id(1); \n" + " out[id].z = get_global_id(2); \n" + "} \n"; + +OCLPartialWrkgrp::OCLPartialWrkgrp() { + _numSubTests = 2; + isOCL2_ = true; +} + +OCLPartialWrkgrp::~OCLPartialWrkgrp() {} + +void OCLPartialWrkgrp::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + _openTest = test; + + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + char version[128]; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_VERSION, + sizeof(version), version, NULL); + + if (_openTest == 1 && strstr(version, "OpenCL 2.0") == NULL) { + isOCL2_ = false; + return; + } + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, 
&strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + switch (_openTest) { + case 0: + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + break; + case 1: + error_ = _wrapper->clBuildProgram( + program_, 1, &devices_[deviceId], + "-cl-uniform-work-group-size -cl-std=CL2.0", NULL, NULL); + break; + default: + CHECK_RESULT(false, "Invalid test number > _numSubTests"); + return; + } + + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "fillX", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + BufSize * sizeof(cl_int4), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLPartialWrkgrp::run(void) { + if (!isOCL2_) return; + unsigned int* values; + cl_mem buffer = buffers()[0]; + values = reinterpret_cast(new cl_int4[BufSize]); + + // + // Check unaligned workgroup in X dimension + // + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, + CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {BufSize - 1}; + size_t lws[1] = {256}; + error_ = 
_wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, lws, 0, NULL, NULL); + + switch (_openTest) { + case 0: + if (error_ != CL_SUCCESS) { + return; + } + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + for (size_t x = 0; x < BufSize; ++x) { + if (x == (BufSize - 1)) { + CHECK_RESULT((values[4 * x] != 0), "Comparison failed!"); + } else { + CHECK_RESULT((values[4 * x] != x), "Comparison failed!"); + } + } + break; + case 1: + CHECK_RESULT((error_ != CL_INVALID_WORK_GROUP_SIZE), + "clEnqueueNDRangeKernel(): " + "Expected to fail for non-uniform work group sizes!"); + default: + CHECK_RESULT(false, "Invalid test number > _numSubTests"); + return; + } + + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed"); + + // + // Check unaligned workgroup in X and Y dimensions + // + kernel_ = _wrapper->clCreateKernel(program_, "fillXY", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, + CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws2[2] = {0x3f, 0x3f}; + size_t lws2[2] = {16, 16}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, + NULL, gws2, lws2, 0, NULL, NULL); + + switch (_openTest) { + case 0: + if (error_ != CL_SUCCESS) { + return; + } + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, 
NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + for (size_t y = 0; y < 0x40; ++y) { + for (size_t x = 0; x < 0x3f; ++x) { + size_t id = x + y * 0x3f; + if (y == 0x3f) { + CHECK_RESULT((values[4 * id] != 0), "Comparison failed!"); + CHECK_RESULT((values[4 * id + 1] != 0), "Comparison failed!"); + } else { + CHECK_RESULT((values[4 * id] != x), "Comparison failed!"); + CHECK_RESULT((values[4 * id + 1] != y), "Comparison failed!"); + } + } + } + break; + case 1: + CHECK_RESULT((error_ != CL_INVALID_WORK_GROUP_SIZE), + "clEnqueueNDRangeKernel(): " + "Expected to fail for non-uniform work group sizes!"); + break; + default: + CHECK_RESULT(false, "Invalid test number > _numSubTests"); + return; + } + + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed"); + + // + // Check unaligned workgroup in X, Y and Z dimensions + // + kernel_ = _wrapper->clCreateKernel(program_, "fillXYZ", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + // Clear destination buffer + memset(values, 0, BufSize * sizeof(cl_int4)); + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffer, + CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws3[3] = {0xf, 0x10, 0xf}; + size_t lws3[3] = {4, 4, 4}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 3, + NULL, gws3, lws3, 0, NULL, NULL); + switch (_openTest) { + case 0: + if (error_ != CL_SUCCESS) { + return; + } + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffer, CL_TRUE, 0, BufSize * sizeof(cl_int4), + values, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + for (size_t z 
= 0; z < 0x10; ++z) { + for (size_t y = 0; y < 0x10; ++y) { + for (size_t x = 0; x < 0xf; ++x) { + size_t id = x + y * 0xf + z * 0xf0; + if (z == 0xf) { + CHECK_RESULT((values[4 * id] != 0), "Comparison failed!"); + CHECK_RESULT((values[4 * id + 1] != 0), "Comparison failed!"); + CHECK_RESULT((values[4 * id + 2] != 0), "Comparison failed!"); + } else { + CHECK_RESULT((values[4 * id] != x), "Comparison failed!"); + CHECK_RESULT((values[4 * id + 1] != y), "Comparison failed!"); + CHECK_RESULT((values[4 * id + 2] != z), "Comparison failed!"); + } + } + } + } + break; + case 1: + CHECK_RESULT((error_ != CL_INVALID_WORK_GROUP_SIZE), + "clEnqueueNDRangeKernel(): " + "Expected fail for non-uniform work group sizes!"); + break; + default: + CHECK_RESULT(false, "Invalid test number > _numSubTests"); + return; + } + + delete[] values; +} + +unsigned int OCLPartialWrkgrp::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h new file mode 100644 index 0000000000..20666e157f --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPartialWrkgrp.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PARTIAL_WRKGRP_H_
+#define _OCL_PARTIAL_WRKGRP_H_
+
+#include "OCLTestImp.h"
+
+// Test that launches kernels whose global size is not a multiple of the
+// work-group size, in one, two and three dimensions.
+class OCLPartialWrkgrp : public OCLTestImp {
+ public:
+  OCLPartialWrkgrp();
+  virtual ~OCLPartialWrkgrp();
+
+ public:
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  virtual void run(void);
+  virtual unsigned int close(void);
+
+ private:
+  // Cleared by open() when sub-test 1 is requested on a device whose version
+  // string lacks "OpenCL 2.0"; run() then returns immediately.
+  bool isOCL2_;
+};
+
+#endif  // _OCL_PARTIAL_WRKGRP_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp
new file mode 100644
index 0000000000..dd434701b5
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp
@@ -0,0 +1,798 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPerfCounters.h"
+
+#include
+#include
+#include
+
+#include "CL/cl.h"
+#include "Timer.h"
+
+// Portable name for the bounded string formatter.
+#ifdef WIN_OS
+#define SNPRINTF sprintf_s
+#else
+#define SNPRINTF snprintf
+#endif
+
+// Identifies one hardware performance counter by block, counter and event.
+struct PerfCounterInfo {
+  cl_long blockIdx;    //!< Block Index
+  cl_long counterIdx;  //!< Counter Index
+  cl_long eventIdx;    //!< Event Index
+};
+
+// The pair of performance counters to use on one named device.
+struct DeviceCounterInfo {
+  const char *deviceName_;          //!< Device name
+  unsigned int devId_;              //!< Device id
+  PerfCounterInfo perfCounter_[2];  //!< Performance counter array
+};
+
+// Per-device counter selections, keyed by device name. The trailing comment
+// on each entry names the two counters as {block, register, event}.
+// NOTE(review): devId_ appears to carry the GFX major version (10, 9) and 0
+// for older parts - confirm against the code that consumes this table.
+static const DeviceCounterInfo DeviceInfo[]{
+    // GFX10
+    {"gfx1000",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx1010",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx1011",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx1012",
+     10,
+     {{15, 0, 4}, {77, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    // GFX9
+    {"gfx900",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx901",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx902",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx903",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx904",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx905",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx906",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    {"gfx907",
+     9,
+     {{14, 0, 4}, {97, 1, 2}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {MCVML2_l,
+                                 // reg 0, BigK bank 0 hits}
+    // Sea Islands, GFX8
+    {"Bonaire",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Hawaii",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Maui",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Casper",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Spectre",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Slimer",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Spooky",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Kalindi",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Mullins",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Iceland",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Tonga",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Bermuda",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Fiji",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Carrizo",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Ellesmere",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Baffin",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Stoney",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"gfx804",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"gfx803",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Bristol Ridge",
+     0,
+     {{14, 0, 4}, {9, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    // Southern Islands
+    {"Tahiti",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Pitcairn",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Capeverde",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Oland",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+    {"Hainan",
+     0,
+     {{10, 0, 4}, {5, 0, 3}}},  // {SQ, reg 0, SQ_PERF_SEL_WAVES}, {GRBM, reg 0,
+                                // GRBM_PERF_SEL_CP_BUSY}
+};
+// Number of entries in DeviceInfo.
+const int DeviceCounterSize = sizeof(DeviceInfo) / sizeof(DeviceCounterInfo);
+
+// OpenCL C source of the SHA-256 kernel (the literal continues past this
+// point).
+static const char *sha256_kernel =
+    "typedef uint UINT;\n"
+    "\n"
+    "#define VECTOR_LEN 1\n"
+    "\n"
+    "#ifdef LITTLE_E\n"
+    "\n"
+    "inline UINT byteswap(UINT x)\n"
+    "{\n"
+    " UINT res = 0;\n"
+    " \n"
+    " for (uint
i=0; i<4; i++)\n" + " {\n" + " res <<= 8;\n" + " res |= (x & 0xff);\n" + " x >>= 8;\n" + " }\n" + " \n" + " return res;\n" + "}\n" + "\n" + "#else\n" + "\n" + "inline UINT byteswap(const UINT x)\n" + "{\n" + " return x;\n" + "}\n" + "\n" + "#endif\n" + "\n" + "\n" + "void sha256_step( const UINT data[16], UINT *state )\n" + "{\n" + " UINT W[64], temp1, temp2;\n" + " UINT A, B, C, D, E, F, G, H;\n" + "\n" + " for( int i = 0; i < 16; i++)\n" + " {\n" + " W[i] = byteswap(data[i]);\n" + " }\n" + "\n" + "#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)\n" + "#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))\n" + "\n" + "#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))\n" + "#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))\n" + "\n" + "#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))\n" + "#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))\n" + "\n" + "#define F0(x,y,z) ((x & y) | (z & (x | y)))\n" + "#define F1(x,y,z) (z ^ (x & (y ^ z)))\n" + "\n" + "#define R(t) \\\n" + "( \\\n" + " W[t] = S1(W[t - 2]) + W[t - 7] + \\\n" + " S0(W[t - 15]) + W[t - 16] \\\n" + ")\n" + "\n" + "#define P(a,b,c,d,e,f,g,h,x,K) \\\n" + "{ \\\n" + " temp1 = h + S3(e) + F1(e,f,g) + K + x; \\\n" + " temp2 = S2(a) + F0(a,b,c); \\\n" + " d += temp1; h = temp1 + temp2; \\\n" + "}\n" + "\n" + " A = state[0];\n" + " B = state[1];\n" + " C = state[2];\n" + " D = state[3];\n" + " E = state[4];\n" + " F = state[5];\n" + " G = state[6];\n" + " H = state[7];\n" + "\n" + " P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 );\n" + " P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 );\n" + " P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF );\n" + " P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 );\n" + " P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B );\n" + " P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 );\n" + " P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 );\n" + " P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 );\n" + " P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 );\n" + " P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 
);\n" + " P( G, H, A, B, C, D, E, F, W[10], 0x243185BE );\n" + " P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 );\n" + " P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 );\n" + " P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE );\n" + " P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 );\n" + " P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 );\n" + " P( A, B, C, D, E, F, G, H, R(16), 0xE49B69C1 );\n" + " P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 );\n" + " P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 );\n" + " P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC );\n" + " P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F );\n" + " P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA );\n" + " P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC );\n" + " P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA );\n" + " P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 );\n" + " P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D );\n" + " P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 );\n" + " P( F, G, H, A, B, C, D, E, R(27), 0xBF597FC7 );\n" + " P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 );\n" + " P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 );\n" + " P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 );\n" + " P( B, C, D, E, F, G, H, A, R(31), 0x14292967 );\n" + " P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 );\n" + " P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 );\n" + " P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC );\n" + " P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 );\n" + " P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 );\n" + " P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB );\n" + " P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E );\n" + " P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 );\n" + " P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 );\n" + " P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B );\n" + " P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 );\n" + " P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 );\n" + " P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 );\n" + " P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 );\n" + " P( C, D, 
E, F, G, H, A, B, R(46), 0xF40E3585 );\n" + " P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 );\n" + " P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 );\n" + " P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 );\n" + " P( G, H, A, B, C, D, E, F, R(50), 0x2748774C );\n" + " P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 );\n" + " P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 );\n" + " P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A );\n" + " P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F );\n" + " P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 );\n" + " P( A, B, C, D, E, F, G, H, R(56), 0x748F82EE );\n" + " P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F );\n" + " P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 );\n" + " P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 );\n" + " P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA );\n" + " P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB );\n" + " P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 );\n" + " P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 );\n" + "\n" + " state[0] += A;\n" + " state[1] += B;\n" + " state[2] += C;\n" + " state[3] += D;\n" + " state[4] += E;\n" + " state[5] += F;\n" + " state[6] += G;\n" + " state[7] += H;\n" + "}\n" + "\n" + "\n" + "#define choose_temp(x) ((x)/16)\n" + "\n" + "#define STORE_TO_TEMP(i) tb[((i)/16)][((i)%16)]\n" + "\n" + "\n" + "__kernel void CryptThread(__global const uint *buffer, __global uint " + "*state, const uint blockLen, const uint foo)\n" + "{\n" + " const uint init[8] = {\n" + " 0x6a09e667,\n" + " 0xbb67ae85,\n" + " 0x3c6ef372,\n" + " 0xa54ff53a,\n" + " 0x510e527f,\n" + " 0x9b05688c,\n" + " 0x1f83d9ab,\n" + " 0x5be0cd19\n" + " };\n" + " \n" + " const uint id = get_global_id(0);\n" + " uint len = blockLen;\n" + " uint i, j;\n" + " const uint startPosInDWORDs = (len*id*foo)/4;\n" + " const uint msgLenInBitsl = len * 8;\n" + " const uint msgLenInBitsh = (len) >> (32-3);\n" + " UINT localState[8];\n" + "\n" + " for (j=0; j<8; j++) {\n" + " localState[j] = init[j];\n" + " }\n" + "\n" + " i = 0;\n" + " while 
(len >=64)\n" + " {\n" + " UINT data[16];\n" + " for (j=0; j<16; j++) {\n" + " data[j] = buffer[j + startPosInDWORDs + i];\n" + " }\n" + "\n" + " sha256_step(data, localState);\n" + " i += 16;\n" + " len -= 64;\n" + " }\n" + "\n" + " len /= 4;\n" + "\n" + " UINT tb[2][16];\n" + "\n" + " for (j=0; jclEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL, + &error_); + + if (error_ != CL_SUCCESS) { + printf("\nError code : %d\n", error_); + } else { + for (unsigned int i = 0; i < width_; i++) data[i] = val; + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, + NULL, NULL); + if (error_ == CL_SUCCESS) retVal = true; + } + return retVal; +} + +void OCLPerfCounters::checkData(cl_mem buffer) { + unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer( + cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL, + &error_); + for (unsigned int i = 0; i < width_; i++) { + } + error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL, + NULL); +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLPerfCounters::open(unsigned int test, char *units, double &conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id *devices = NULL; + cl_device_id device = NULL; + _crcword = 0; + conversion = 1.0f; + _deviceId = deviceId; + _openTest = test; + + context_ = 0; + cmd_queue_ = 0; + program_ = 0; + kernel_ = 0; + inBuffer_ = 0; + outBuffer_ = 0; + num_input_buf_ = 1; + num_output_buf_ = 1; + blockSize_ = 1024; + isAMD = false; + + if (type_ != CL_DEVICE_TYPE_GPU) { + char msg[256]; + SNPRINTF(msg, sizeof(msg), "No GPU devices present. 
Exiting!\t"); + testDescString = msg; + return; + } + + width_ = 22347776; + // We compute a square domain + bufSize_ = width_ * sizeof(cl_uint); + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id *platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms-1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); + // Runtime returns an error when no GPU devices are present instead of just + // returning 0 devices + // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + // Choose platform with GPU devices + if (num_devices > 0) { + if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { + isAMD = true; + } + // platform = platforms[_platformIndex]; + // break; + } +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + + global_device = device; + + context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, + &error_); + CHECK_RESULT(context_ == 0, "clCreateContext failed"); + + char charbuf[1024]; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, + charbuf, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL); + CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); + + inBuffer_ = new cl_mem[4]; + outBuffer_ = new cl_mem[4]; + + for (int i = 0; i < num_input_buf_; ++i) { + inBuffer_[i] = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(inBuffer_[i] == 0, "clCreateBuffer(inBuffer) failed"); + bool result = setData(inBuffer_[i], 0xdeadbeef); + CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed"); + } + + for (int i = 0; i < num_output_buf_; ++i) { + outBuffer_[i] = + _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_); + CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed"); + bool result = setData(outBuffer_[i], 0xdeadbeef); + CHECK_RESULT(result != true, "clEnqueueMapBuffer buffer failed"); + } + + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, (const char **)&sha256_kernel, NULL, &error_); + CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); + + const char *buildOps = NULL; + if (isAMD) { + // Enable caching + buildOps 
= "-fno-alias"; + } + error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL); + + if (error_ != CL_SUCCESS) { + cl_int intError; + char log[16384]; + intError = + _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, + 16384 * sizeof(char), log, NULL); + printf("Build error -> %s\n", log); + + CHECK_RESULT(0, "clBuildProgram failed"); + } + kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_); + CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&inBuffer_[0]); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), + (void *)&outBuffer_[0]); + error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint), + (void *)&blockSize_); + // Foo is not part of the original test, this can be used to see how much of + // the performance is limited by fetch. Set foo to 0 and all threads will + // fetch the same 1k block. This way they will all be in cache and hit max + // fetch speed. + unsigned int foo = 1; + error_ = _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_uint), (void *)&foo); +} + +void OCLPerfCounters::run(void) { + // Test runs only on GPU + if (type_ != CL_DEVICE_TYPE_GPU) return; + + size_t global = bufSize_ / blockSize_; + // 32 gives the best result due to memory thrashing. Need to optimize and + // give feedback to SiSoft. 
+ size_t local = 64; + char buf[256]; + + size_t global_work_size[1] = {global}; + size_t local_work_size[1] = {local}; + + cl_int err = 0; + cl_perfcounter_amd perfCounter; + cl_perfcounter_property properties[4][2]; + cl_event perfEvent; + cl_ulong result; + char deviceName[1024]; + + properties[0][0] = CL_PERFCOUNTER_GPU_BLOCK_INDEX; + properties[1][0] = CL_PERFCOUNTER_GPU_COUNTER_INDEX; + properties[2][0] = CL_PERFCOUNTER_GPU_EVENT_INDEX; + properties[3][0] = CL_PERFCOUNTER_NONE; + + err = _wrapper->clGetDeviceInfo(global_device, CL_DEVICE_NAME, 1024, + deviceName, NULL); + CHECK_RESULT(err != CL_SUCCESS, "clGetDeviceInfo failed"); + + // Begin: to be removed when crash on Kabini is fixed + if (strcmp(deviceName, "Kalindi") == 0) { + char msg[256]; + SNPRINTF(msg, sizeof(msg), "Exiting as device is Kabini!\t"); + testDescString = msg; + return; + } + // End: to be removed when crash on Kabini is fixed + + bool found = false; + unsigned int devId = 0; + for (int idx = 0; !found && idx < DeviceCounterSize; idx++) { + if (strcmp(deviceName, DeviceInfo[idx].deviceName_) == 0) { + devId = DeviceInfo[idx].devId_; + properties[0][1] = DeviceInfo[idx].perfCounter_[_openTest].blockIdx; + properties[1][1] = DeviceInfo[idx].perfCounter_[_openTest].counterIdx; + properties[2][1] = DeviceInfo[idx].perfCounter_[_openTest].eventIdx; + found = true; + } + } + + if (!found) { + char msg[256]; + SNPRINTF(msg, sizeof(msg), "Unsupported device(%s) for the test!\t", + deviceName); + testDescString = msg; + return; + } + + perfCounter = + _wrapper->clCreatePerfCounterAMD(global_device, &properties[0][0], &err); + CHECK_RESULT(err != CL_SUCCESS, "Create PerfCounter failed\n"); + + // set clock mode + cl_set_device_clock_mode_input_amd setClockModeInput; + setClockModeInput.clock_mode = CL_DEVICE_CLOCK_MODE_PROFILING_AMD; + cl_set_device_clock_mode_output_amd setClockModeOutput = {}; + _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput, + &setClockModeOutput); + + 
_wrapper->clEnqueueBeginPerfCounterAMD(cmd_queue_, 1, &perfCounter, 0, NULL, + NULL); + + for (unsigned int i = 0; i < MAX_ITERATIONS; i++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void *)&inBuffer_[i % num_input_buf_]); + error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), + (void *)&outBuffer_[i % num_output_buf_]); + + error_ = _wrapper->clEnqueueNDRangeKernel( + cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size, + (const size_t *)local_work_size, 0, NULL, NULL); + } + + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueNDRangeKernel failed"); + + _wrapper->clEnqueueEndPerfCounterAMD(cmd_queue_, 1, &perfCounter, 0, NULL, + &perfEvent); + clWaitForEvents(1, &perfEvent); + + // set clock mode to default + setClockModeInput.clock_mode = CL_DEVICE_CLOCK_MODE_DEFAULT_AMD; + _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput, + &setClockModeOutput); + + _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, + sizeof(cl_ulong), &result, NULL); + + err = _wrapper->clReleasePerfCounterAMD(perfCounter); + CHECK_RESULT(err != CL_SUCCESS, "Release PerfCounter failed\n"); + + switch (_openTest) { + case 0: + SNPRINTF(buf, sizeof(buf), "SQ Number of Waves: %lu ", (long)result); + break; + case 1: + if (devId >= 9) { + SNPRINTF(buf, sizeof(buf), "BigK Bank0 hits: %lu ", (long)result); + } else { + SNPRINTF(buf, sizeof(buf), "GRBM CP Busy: %lu ", (long)result); + } + break; + } + + testDescString = buf; + CHECK_RESULT(!(result > 0), "Perf counter value read is zero!\n"); +} + +unsigned int OCLPerfCounters::close(void) { + _wrapper->clFinish(cmd_queue_); + + if (inBuffer_) { + for (int i = 0; i < num_input_buf_; ++i) { + error_ = _wrapper->clReleaseMemObject(inBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(inBuffer_) failed"); + } + delete[] inBuffer_; + } + if (outBuffer_) { + for (int i = 0; i < num_output_buf_; ++i) { + error_ = 
_wrapper->clReleaseMemObject(outBuffer_[i]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseMemObject(outBuffer_) failed"); + } + delete[] outBuffer_; + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (cmd_queue_) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (context_) { + error_ = _wrapper->clReleaseContext(context_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + + return _crcword; +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h new file mode 100644 index 0000000000..89751e3e41 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPerfCounters.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PERF_COUNTERS_H_
+#define _OCL_PERF_COUNTERS_H_
+
+#include <string>
+
+#include "OCLTestImp.h"
+
+// Test for the AMD performance-counter extension: run() brackets the SHA-256
+// "CryptThread" kernel with clEnqueueBeginPerfCounterAMD /
+// clEnqueueEndPerfCounterAMD, reads CL_PERFCOUNTER_DATA and fails when the
+// value read back is zero. The sub-test index passed to open() selects which
+// of the two per-device counters is sampled.
+class OCLPerfCounters : public OCLTestImp {
+ public:
+  OCLPerfCounters();
+  virtual ~OCLPerfCounters();
+
+ public:
+  // Creates the context, command queue, input/output buffers and the kernel
+  // for device `deviceID` of the selected platform. `conversion` is set to 1.
+  // Does nothing but set the description string when the device type is not
+  // GPU.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Enqueues the kernel MAX_ITERATIONS times with the counter enabled and
+  // publishes the counter value through the test description string.
+  virtual void run(void);
+  // Releases every OpenCL object created by open() and returns the CRC word.
+  virtual unsigned int close(void);
+  std::string shader_;  //!< Not referenced by the implementation shown in
+                        //!< this patch - TODO confirm whether it is needed
+  // Maps `buffer` for writing and stores `data` into each of its width_
+  // elements; returns true when both the map and the unmap succeed.
+  bool setData(cl_mem buffer, unsigned int data);
+  // Maps `buffer` for reading and unmaps it again. NOTE(review): the loop in
+  // the implementation is empty, so no contents are actually verified.
+  void checkData(cl_mem buffer);
+  cl_context context_;          //!< Context created in open()
+  cl_command_queue cmd_queue_;  //!< Queue all commands are enqueued on
+  cl_program program_;          //!< Program built from the SHA-256 source
+  cl_kernel kernel_;            //!< "CryptThread" kernel object
+  cl_mem* inBuffer_;            //!< Input buffers (array allocated in open())
+  cl_mem* outBuffer_;           //!< Output buffers (array allocated in open())
+  cl_int num_input_buf_;        //!< Number of entries of inBuffer_ in use
+  cl_int num_output_buf_;       //!< Number of entries of outBuffer_ in use
+  cl_int error_;                //!< Status of the most recent OpenCL call
+  unsigned int width_;          //!< Buffer length in cl_uint elements
+  unsigned int bufSize_;        //!< Buffer size in bytes
+  unsigned int blockSize_;      //!< Bytes hashed per work-item (kernel arg 2)
+  static const unsigned int MAX_ITERATIONS = 1;  //!< Kernel launches per run()
+  bool isAMD;  //!< True when the platform vendor string is AMD's
+};
+
+#endif  // _OCL_PERF_COUNTERS_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp
new file mode 100644
index 0000000000..5151ae9c12
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPersistent.h"
+
+#include
+#include
+#include
+#include
+#include
+
+// Kernel under test: every work-item stores its own (x, y) global id into the
+// R and G channels of the destination image.
+const static char* strKernel =
+    "__kernel void persistentImage( write_only image2d_t source){ \n"
+    " int tidX = get_global_id(0);\n"
+    " int tidY = get_global_id(1);\n"
+    " write_imagei( source, (int2)( tidX, tidY ),(int4)( tidX, tidY,0,0 ) "
+    ");\n"
+    "}\n";
+
+OCLPersistent::OCLPersistent() : clImage_(0) { _numSubTests = 1; }
+
+OCLPersistent::~OCLPersistent() {}
+
+// Builds the kernel and creates the c_dimSize x c_dimSize two-channel signed
+// 32-bit image (CL_MEM_USE_PERSISTENT_MEM_AMD | CL_MEM_WRITE_ONLY) it writes
+// to. Returns early, leaving the error state set, when any step fails.
+void OCLPersistent::open(unsigned int test, char* units, double& conversion,
+                         unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  if (_errorFlag) return;
+
+  // Build the kernel
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed!");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Zero-initialised so that a failed or oversized log query prints an
+    // empty string instead of uninitialised stack contents.
+    char programLog[1024] = {0};
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
+                                    programLog, 0);
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed!");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "persistentImage", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed!");
+  cl_image_format format;
+  format.image_channel_data_type = CL_SIGNED_INT32;
+  format.image_channel_order = CL_RG;
+  cl_image_desc desc = {0};
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = c_dimSize;
+  desc.image_height = c_dimSize;
+  desc.image_depth = 1;
+  desc.image_array_size = 1;
+  // CL_MEM_USE_PERSISTENT_MEM_AMD
+  clImage_ =
+      clCreateImage(context_, CL_MEM_USE_PERSISTENT_MEM_AMD | CL_MEM_WRITE_ONLY,
+                    &format, &desc, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed");
+}
+
+// Runs the kernel over the whole image, polls a marker event until the work
+// has finished, then maps the image and checks every pixel.
+void OCLPersistent::run(void) {
+  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImage_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t dimSizes[] = {c_dimSize, c_dimSize};
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {c_dimSize, c_dimSize, 1};
+  size_t pitch, slice;
+  cl_event event = NULL;
+  error_ = _wrapper->clEnqueueNDRangeKernel(
+      cmdQueues_[_deviceId], kernel_, 2, NULL, dimSizes, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+  error_ = _wrapper->clEnqueueMarkerWithWaitList(cmdQueues_[_deviceId], 0, NULL,
+                                                 &event);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMarkerWithWaitList() failed");
+
+  _wrapper->clFlush(cmdQueues_[_deviceId]);
+
+  // Poll the marker instead of blocking on it. The execution status is a
+  // signed value: CL_QUEUED > CL_SUBMITTED > CL_RUNNING > CL_COMPLETE (0), and
+  // a negative value means the command terminated abnormally. Stop on either
+  // outcome so a failed command cannot keep this loop spinning forever.
+  cl_int status = CL_QUEUED;
+  do {
+    error_ = _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                      sizeof(status), &status, NULL);
+  } while ((error_ == CL_SUCCESS) && (status > CL_COMPLETE));
+  // The event is only needed for the poll above; release it before any early
+  // return. Called directly, like clCreateImage() in open().
+  clReleaseEvent(event);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetEventInfo() failed");
+  CHECK_RESULT((status != CL_COMPLETE), "Kernel execution did not complete");
+
+  unsigned int* image = (unsigned int*)_wrapper->clEnqueueMapImage(
+      cmdQueues_[_deviceId], clImage_, CL_TRUE, CL_MAP_READ, origin, region,
+      &pitch, &slice, 0, NULL, NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueMapImage() failed");
+
+  bool result = validateImage(image, pitch, c_dimSize);
+
+  // Unmap before reporting the verdict so a validation failure does not leave
+  // the image mapped.
+  error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], clImage_,
+                                             image, 0, NULL, NULL);
+  CHECK_RESULT(!result, "Validation failed!");
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueUnmapMemObject() failed");
+}
+
+// Releases the image (when one was created) and then the base-class objects.
+unsigned int OCLPersistent::close(void) {
+  if (clImage_) {
+    _wrapper->clReleaseMemObject(clImage_);
+    clImage_ = 0;
+  }
+
+  return OCLTestImp::close();
+}
+
+// Checks that pixel (x, y) of the mapped image holds x in its first channel
+// and y in its second. `pitch` is the row pitch in bytes as returned by
+// clEnqueueMapImage; `dimSize` is the image width and height in pixels.
+// Prints the first mismatch and returns false, otherwise returns true.
+bool OCLPersistent::validateImage(unsigned int* image, size_t pitch,
+                                  unsigned int dimSize) {
+  unsigned int x, y;
+  for (y = 0; y < dimSize; y++) {
+    for (x = 0; x < dimSize; x++) {
+      // Two channels per pixel: R at 2 * x, G at 2 * x + 1.
+      const unsigned int red = image[2 * x];
+      const unsigned int green = image[2 * x + 1];
+      if ((red != x) || (green != y)) {
+        // Cast so the argument types match the %d conversions.
+        printf("Failed at coordinate (%5d, %5d) - R:%d, G:%d value\n", (int)x,
+               (int)y, (int)red, (int)green);
+        return false;
+      }
+    }
+    // Advance one row; the pitch may be larger than the packed row size.
+    image += pitch / sizeof(int);
+  }
+  return true;
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h
new file mode 100644
index 0000000000..a7585db0a9
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPersistent.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/ + +#ifndef _OCL_PERSISTENT_H_ +#define _OCL_PERSISTENT_H_ + +#include "OCLTestImp.h" + +/* Test in which a kernel writes each pixel's (x, y) coordinate into a 2D image created with CL_MEM_USE_PERSISTENT_MEM_AMD; the image is then mapped and every pixel is validated. */ +class OCLPersistent : public OCLTestImp { + public: + OCLPersistent(); + virtual ~OCLPersistent(); + static const unsigned int c_dimSize = 510; /* width and height of the test image, in pixels */ + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceId); + virtual void run(void); + virtual unsigned int close(void); + + private: + //////////////////// + // test functions // + //////////////////// + + bool validateImage(unsigned int* image, size_t pitch, unsigned int dimSize); + ///////////////////// + // private members // + ///////////////////// + + // CL identifiers + cl_mem clImage_; +}; + +#endif // _OCL_PERSISTENT_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp new file mode 100644 index 0000000000..c67d8ed620 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.cpp @@ -0,0 +1,218 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPinnedMemory.h"
+
+#ifdef _WIN32
+#include <VersionHelpers.h>
+// Pick up from OCLSVM
+size_t getTotalSystemMemory();
+#else
+#include <sys/sysinfo.h>
+// Returns the total amount of physical RAM in bytes, or 0 if sysinfo() fails
+// (open() then reports "below 4GB" and skips the test).
+size_t getTotalSystemMemory() {
+  struct sysinfo info;
+  if (sysinfo(&info) != 0) {
+    return 0;
+  }
+  // sysinfo(2) reports totalram in units of mem_unit bytes.
+  return static_cast<size_t>(info.totalram) * info.mem_unit;
+}
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+
+OCLPinnedMemory::OCLPinnedMemory() { _numSubTests = 2; }
+
+OCLPinnedMemory::~OCLPinnedMemory() {}
+
+// Prepares sub-test |test| on device |deviceId|.
+//
+// The test is skipped (by storing -1 in _openTest, which run() ignores) on
+// pre-Windows-8 hosts, on devices reporting host-unified memory, on devices
+// with fewer than 64 address bits and on hosts with 4GB of RAM or less.
+// Otherwise a host allocation of roughly ratio_ * total RAM bytes is created;
+// it is released again in close().
+void OCLPinnedMemory::open(unsigned int test, char* units, double& conversion,
+                           unsigned int deviceId) {
+  // Clear the pointer before anything can bail out so that close() never
+  // calls delete[] on an indeterminate value.
+  host_memory_ = nullptr;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_ERROR(error_, "Error opening test");
+  _openTest = test;
+
+#ifdef _WIN32
+  // Observed failures on Win7
+  if (!IsWindows8OrGreater()) {
+    printf("Test requires Windows 8 or newer, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+#endif
+
+  cl_int status;
+
+  // Observed failures with Carrizo on GSL path
+  cl_bool is_apu;
+  status = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_HOST_UNIFIED_MEMORY,
+                           sizeof(cl_bool), &is_apu, nullptr);
+  CHECK_ERROR(status, "clGetDeviceInfo failed.");
+  if (is_apu) {
+    printf("Test not supported for apus, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+
+  cl_uint address_bits;
+  status = clGetDeviceInfo(devices_[deviceId], CL_DEVICE_ADDRESS_BITS,
+                           sizeof(cl_uint), &address_bits, nullptr);
+  CHECK_ERROR(status, "clGetDeviceInfo failed.");
+  if (address_bits < 64u) {
+    printf("GPU VA range size below 4GB, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+
+  row_size_ = getTotalSystemMemory();
+  if (row_size_ <= (1ull << 32u)) {
+    printf("System memory below 4GB, skipping...\n");
+    _openTest = -1;
+    return;
+  }
+  // row_size_ becomes the host row pitch in bytes: sqrt(ratio_ * total RAM),
+  // rounded up to a multiple of the element size. One row per element of the
+  // source buffer then gives row_size_ * row_size_ bytes in total.
+  row_size_ *= ratio_;
+  row_size_ = floor(sqrt(row_size_));
+  row_size_ = (row_size_ + row_data_size_ - 1) & ~(row_data_size_ - 1);
+
+  pin_size_ = row_size_ * row_size_ / row_data_size_;
+  host_memory_ = new row_data_t[pin_size_];
+}
+
+// Sub-test 0: reads a device buffer into the large host allocation with
+// clEnqueueReadBufferRect (one element per host row) and verifies the result.
+void OCLPinnedMemory::runNoPrepinnedMemory() {
+  cl_int status;
+
+  row_data_t* tmp = new row_data_t[row_size_];
+  std::iota(tmp, tmp + row_size_, 0);
+  std::fill_n(host_memory_, pin_size_, 0);
+
+  cl_mem tmp_buffer = clCreateBuffer(context_, CL_MEM_USE_HOST_PTR,
+                                     row_size_ * row_data_size_, tmp, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+  cl_mem buffer = clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                 row_size_ * row_data_size_, nullptr, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  status = clEnqueueCopyBuffer(cmdQueues_[_deviceId], tmp_buffer, buffer, 0, 0,
+                               row_size_ * row_data_size_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueCopyBuffer failed.");
+  clFinish(cmdQueues_[_deviceId]);
+
+  size_t buffer_offset[3] = {0, 0, 0};
+  size_t host_offset[3] = {0, 0, 0};
+  size_t region[3] = {row_data_size_, row_size_, 1};
+
+  status = clEnqueueReadBufferRect(
+      cmdQueues_[_deviceId], buffer, CL_TRUE, buffer_offset, host_offset,
+      region, 0, 0, row_size_, 0, host_memory_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueReadBufferRect failed.");
+  status = clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(status, "clFinish failed.");
+
+  // The host row pitch is row_size_ bytes, so source element i lands
+  // i * row_size_ / row_data_size_ entries into host_memory_.
+  bool mismatch = false;
+  for (uint64_t i = 0; i < row_size_; i++) {
+    if (tmp[i] != host_memory_[i * row_size_ / row_data_size_]) {
+      mismatch = true;
+      break;
+    }
+  }
+
+  // Release the resources before reporting the comparison result;
+  // CHECK_RESULT presumably returns from this function when it fires, which
+  // previously leaked both buffers and the staging row on a mismatch.
+  status = clReleaseMemObject(buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  status = clReleaseMemObject(tmp_buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  delete[] tmp;
+
+  CHECK_RESULT(mismatch, "Error when reading data.");
+}
+
+// Sub-test 1: same transfer as sub-test 0, but the host allocation is first
+// wrapped in a CL_MEM_USE_HOST_PTR buffer and mapped before the rect read.
+void OCLPinnedMemory::runPrepinnedMemory() {
+  cl_int status;
+
+  row_data_t* tmp = new row_data_t[row_size_];
+  std::iota(tmp, tmp + row_size_, 0);
+  std::fill_n(host_memory_, pin_size_, 0);
+
+  cl_mem tmp_buffer = clCreateBuffer(context_, CL_MEM_USE_HOST_PTR,
+                                     row_size_ * row_data_size_, tmp, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+  cl_mem buffer = clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                 row_size_ * row_data_size_, nullptr, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  status = clEnqueueCopyBuffer(cmdQueues_[_deviceId], tmp_buffer, buffer, 0, 0,
+                               row_size_ * row_data_size_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueCopyBuffer failed.");
+
+  cl_mem pinned_buffer =
+      clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, pin_size_ * row_data_size_,
+                     host_memory_, &status);
+  CHECK_ERROR(status, "clCreateBuffer failed.");
+
+  // Keep the mapped pointer: it is the value clEnqueueUnmapMemObject expects.
+  void* mapped = clEnqueueMapBuffer(
+      cmdQueues_[_deviceId], pinned_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+      0, pin_size_ * row_data_size_, 0, nullptr, nullptr, &status);
+  CHECK_ERROR(status, "clEnqueueMapBuffer failed.");
+
+  size_t buffer_offset[3] = {0, 0, 0};
+  size_t host_offset[3] = {0, 0, 0};
+  size_t region[3] = {row_data_size_, row_size_, 1};
+
+  status = clEnqueueReadBufferRect(
+      cmdQueues_[_deviceId], buffer, CL_TRUE, buffer_offset, host_offset,
+      region, 0, 0, row_size_, 0, host_memory_, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueReadBufferRect failed.");
+
+  bool mismatch = false;
+  for (uint64_t i = 0; i < row_size_; i++) {
+    if (tmp[i] != host_memory_[i * row_size_ / row_data_size_]) {
+      mismatch = true;
+      break;
+    }
+  }
+
+  // Unmap and release before reporting: close() frees host_memory_, so the
+  // USE_HOST_PTR buffer must not outlive this function on a mismatch.
+  status = clEnqueueUnmapMemObject(cmdQueues_[_deviceId], pinned_buffer,
+                                   mapped, 0, nullptr, nullptr);
+  CHECK_ERROR(status, "clEnqueueUnmap failed.");
+  status = clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(status, "clFinish failed.");
+
+  status = clReleaseMemObject(pinned_buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  status = clReleaseMemObject(buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  status = clReleaseMemObject(tmp_buffer);
+  CHECK_ERROR(status, "clReleaseMemObject failed.");
+  delete[] tmp;
+
+  CHECK_RESULT(mismatch, "Error when reading data.");
+}
+
+// Dispatches to the sub-test selected in open().
+void OCLPinnedMemory::run() {
+  switch (_openTest) {
+    case 0:
+      runNoPrepinnedMemory();
+      break;
+    case 1:
+      runPrepinnedMemory();
+      break;
+    default:
+      // open() stores -1 when the test is skipped; nothing to run.
+      break;
+  }
+}
+
+// Frees the host allocation made in open() and closes the base test.
+unsigned int OCLPinnedMemory::close() {
+  delete[] host_memory_;
+  host_memory_ = nullptr;
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h
new file mode 100644
index 0000000000..bc3d633b6e
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPinnedMemory.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
*/
+
+#ifndef _OCL_PINNED_MEMORY_H_
+#define _OCL_PINNED_MEMORY_H_
+
+#include <cstdint>
+
+#include "OCLTestImp.h"
+
+// Runtime test that reads a device buffer into a very large host allocation
+// with clEnqueueReadBufferRect, with and without pinning the host memory
+// beforehand (see runNoPrepinnedMemory / runPrepinnedMemory).
+class OCLPinnedMemory : public OCLTestImp {
+ public:
+  OCLPinnedMemory();
+  ~OCLPinnedMemory();
+
+  void open(unsigned int test, char* units, double& conversion,
+            unsigned int deviceId) override;
+  void run() override;
+  unsigned int close() override;
+
+ private:
+  void runNoPrepinnedMemory();
+  void runPrepinnedMemory();
+
+  // Fraction of the total system memory used for the host allocation.
+  static constexpr const float ratio_ = 0.4f;
+  using row_data_t = uint64_t;
+
+  // Host allocation made in open() and freed in close(). It starts out null
+  // so that close() is safe even when open() returns before allocating.
+  row_data_t* host_memory_ = nullptr;
+  size_t row_data_size_ = sizeof(row_data_t);
+  // Host row pitch in bytes; also the number of elements transferred.
+  size_t row_size_ = 0;
+  // Number of row_data_t elements in host_memory_.
+  size_t pin_size_ = 0;
+};
+
+#endif  // _OCL_PINNED_MEMORY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp
new file mode 100644
index 0000000000..c1abadf7fb
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.cpp
@@ -0,0 +1,182 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLPlatformAtomics.h"
+
+#include
+#include
+#include
+
+#include "CL/cl.h"
+
+// Device side of the test: spin until the host raises *pSync, then add 1 to
+// *ptr numIterations times with system-scope atomics.
+const static char* strKernel =
+    "__kernel void test_atomic_kernel(volatile __global atomic_int *pSync, "
+    "volatile __global atomic_int *ptr, int numIterations)\n"
+    "{ "
+    " \n"
+    " while(atomic_load_explicit(pSync, memory_order_acquire, "
+    "memory_scope_all_svm_devices) == 0); \n"
+    " for (int i = 0; i < numIterations; i++) { "
+    " \n"
+    " atomic_fetch_add_explicit(ptr, 1, memory_order_acq_rel, "
+    "memory_scope_all_svm_devices); \n"
+    " } "
+    " \n"
+    "} "
+    " \n";
+
+OCLPlatformAtomics::OCLPlatformAtomics() {
+  _numSubTests = 1;
+  failed_ = false;
+  svmCaps_ = 0;
+}
+
+OCLPlatformAtomics::~OCLPlatformAtomics() {}
+
+// Builds the atomic kernel for the selected device. Sets failed_ (so that
+// run() becomes a no-op) when the device reports an OpenCL version below 2.0.
+void OCLPlatformAtomics::open(unsigned int test, char* units,
+                              double& conversion, unsigned int deviceId) {
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+
+  size_t param_size = 0;
+  char* strVersion = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // CL_DEVICE_VERSION is "OpenCL <major>.<minor> ...", so index 7 holds the
+  // major version digit. Evaluate it and free the string before any early
+  // return below; a string too short to hold the digit counts as unsupported.
+  const bool preCL20 = (error_ == CL_SUCCESS) &&
+                       ((param_size <= 7) || (strVersion[7] < '2'));
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Query the log size first: a fixed-size buffer makes the query fail (and
+    // leaves the buffer uninitialized) whenever the log does not fit.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char* programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    if (_wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId],
+                                        CL_PROGRAM_BUILD_LOG, logSize,
+                                        programLog, 0) == CL_SUCCESS) {
+      programLog[logSize] = '\0';
+    }
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "test_atomic_kernel", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+}
+
+// Atomically reads *object (implemented as an atomic add of zero).
+static int AtomicLoad(volatile cl_int* object) {
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+  return InterlockedExchangeAdd((volatile long*)object, 0);
+#elif defined(__GNUC__)
+  return __sync_add_and_fetch(object, 0);
+#else
+  printf("Atomic load not supported, aborting...");
+  return 0;
+#endif
+}
+
+// Atomically increments *object and returns the incremented value on every
+// supported compiler.
+static int AtomicIncrement(volatile cl_int* object) {
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+  return _InterlockedIncrement((volatile long*)object);
+#elif defined(__GNUC__)
+  return __sync_add_and_fetch(object, 1);
+#else
+  printf("Atomic increment not supported, aborting...");
+  return 0;
+#endif
+}
+
+// Runs the kernel and a host loop concurrently, both incrementing the same
+// fine-grained SVM counter, and checks that no increment was lost.
+void OCLPlatformAtomics::run(void) {
+  if (failed_) return;
+
+#ifdef CL_VERSION_2_0
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_SVM_CAPABILITIES,
+                                sizeof(svmCaps_), &svmCaps_, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed");
+
+  if (!(svmCaps_ & CL_DEVICE_SVM_ATOMICS)) {
+    printf("SVM atomics not supported, skipping test...\n");
+    return;
+  }
+
+  volatile cl_int* pSyncBuf = (volatile cl_int*)_wrapper->clSVMAlloc(
+      context_, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+      sizeof(cl_int), 0);
+  CHECK_RESULT(!pSyncBuf, "clSVMAlloc() failed");
+  *pSyncBuf = 0;
+
+  volatile cl_int* pAtomicBuf = (volatile cl_int*)_wrapper->clSVMAlloc(
+      context_, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+      sizeof(cl_int), 0);
+  if (!pAtomicBuf) {
+    // Do not leak the first allocation when the second one fails.
+    _wrapper->clSVMFree(context_, (void*)pSyncBuf);
+  }
+  CHECK_RESULT(!pAtomicBuf, "clSVMAlloc() failed");
+  *pAtomicBuf = 0;
+
+  error_ =
+      _wrapper->clSetKernelArgSVMPointer(kernel_, 0, (const void*)pSyncBuf);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArgSVMPointer() failed");
+
+  error_ =
+      _wrapper->clSetKernelArgSVMPointer(kernel_, 1, (const void*)pAtomicBuf);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArgSVMPointer() failed");
+
+  cl_int numIterations = 0x100000;
+  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_int), &numIterations);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
+
+  size_t globalWorkSize[1] = {1};
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL,
+                                       globalWorkSize, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
+
+  clFlush(cmdQueues_[_deviceId]);
+
+  // Release the kernel, which spins until *pSyncBuf becomes non-zero.
+  AtomicIncrement(pSyncBuf);
+
+  // wait until we see some activity from a device (try to run host side
+  // simultaneously).
+  while (AtomicLoad(pAtomicBuf /*, memory_order_relaxed*/) == 0)
+    ;
+
+  for (int i = 0; i < numIterations; i++) {
+    AtomicIncrement(pAtomicBuf);
+  }
+
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_ERROR(error_, "clFinish() failed");
+
+  // Read the result and free both allocations before reporting, so that a
+  // failed comparison does not leak the SVM memory.
+  const int expected = numIterations * 2;
+  const int found = *pAtomicBuf;
+
+  _wrapper->clSVMFree(context_, (void*)pSyncBuf);
+  _wrapper->clSVMFree(context_, (void*)pAtomicBuf);
+
+  CHECK_RESULT(found != expected, "Expected: 0x%x, found: 0x%x", expected,
+               found);
+#endif
+}
+
+unsigned int OCLPlatformAtomics::close(void) { return OCLTestImp::close(); }
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h
new file mode 100644
index 0000000000..c728fb6c36
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLPlatformAtomics.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_PLATFORM_ATOMICS_H_
+#define _OCL_PLATFORM_ATOMICS_H_
+
+#include "OCLTestImp.h"
+
+// Runtime test in which the host and a device kernel concurrently increment a
+// counter that lives in fine-grained SVM memory.
+class OCLPlatformAtomics : public OCLTestImp {
+ public:
+  OCLPlatformAtomics();
+  virtual ~OCLPlatformAtomics();
+
+ public:
+  void open(unsigned int test, char* units, double& conversion,
+            unsigned int deviceID) override;
+  void run(void) override;
+  unsigned int close(void) override;
+
+  // Set by open() when the device reports an OpenCL version below 2.0; run()
+  // then returns without doing anything.
+  bool failed_;
+  // Receives the CL_DEVICE_SVM_CAPABILITIES value queried in run().
+  unsigned long long svmCaps_;
+};
+
+#endif  // _OCL_PLATFORM_ATOMICS_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp
new file mode 100644
index 0000000000..4baf3db684
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.cpp
@@ -0,0 +1,274 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#include "OCLProgramScopeVariables.h"
+
+#include "CL/cl.h"
+
+// Prints the build log of |program| for |device|. The log size is queried
+// first, so logs of any length are printed and nothing is printed from an
+// uninitialized buffer when the query fails.
+template <typename Wrapper>
+static void printBuildLog(const Wrapper& wrapper, cl_program program,
+                          cl_device_id device) {
+  size_t logSize = 0;
+  if (wrapper->clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0,
+                                     NULL, &logSize) != CL_SUCCESS ||
+      logSize == 0) {
+    return;
+  }
+  char* log = (char*)malloc(logSize + 1);
+  if (log == NULL) {
+    return;
+  }
+  if (wrapper->clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                                     logSize, log, NULL) == CL_SUCCESS) {
+    log[logSize] = '\0';
+    printf("\n\n%s\n\n", log);
+  }
+  free(log);
+}
+
+OCLProgramScopeVariables::OCLProgramScopeVariables()
+    : silentFailure(false), kernel1_(0), kernel2_(0) {
+  _numSubTests = 3;
+}
+
+OCLProgramScopeVariables::~OCLProgramScopeVariables() {}
+
+// Opens sub-test |test| and sets silentFailure (so that run() becomes a
+// no-op) when the device does not report OpenCL C 2.0 or newer.
+void OCLProgramScopeVariables::open(unsigned int test, char* units,
+                                    double& conversion, unsigned int deviceId) {
+  // Reset our own state first: close() inspects the kernel handles even when
+  // the base open() below fails.
+  silentFailure = false;
+  kernel1_ = kernel2_ = 0;
+
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "error_ opening test");
+  _openTest = test;
+  size_t param_size = 0;
+  program_ = 0;
+  error_ = _wrapper->clGetDeviceInfo(
+      devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION, 0, 0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  char* strVersion = (char*)malloc(param_size + 1);
+  CHECK_RESULT(strVersion == NULL, "malloc failed");
+  error_ =
+      _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_OPENCL_C_VERSION,
+                                param_size, strVersion, 0);
+  // CL_DEVICE_OPENCL_C_VERSION is "OpenCL C <major>.<minor> ...", so index 9
+  // holds the major version digit. Evaluate it and free the string before any
+  // early return; a string too short to hold the digit counts as unsupported.
+  const bool hasCL20 =
+      (error_ == CL_SUCCESS) && (param_size > 9) && (strVersion[9] >= '2');
+  free(strVersion);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (!hasCL20) {
+    printf("\nOpenCL C 2.0 not supported\n");
+    silentFailure = true;
+  }
+}
+
+// Dispatches to the sub-test selected in open().
+void OCLProgramScopeVariables::run(void) {
+  if (silentFailure) return;
+  switch (_openTest) {
+    case 0:
+      test0();
+      break;
+    case 1:
+      test1();
+      break;
+    case 2:
+      test2();
+      break;
+    default:
+      break;
+  }
+}
+
+// Sub-test 0: one kernel fills a program-scope array, a second kernel copies
+// it into a buffer, and the host checks that element i equals i.
+void OCLProgramScopeVariables::test0(void) {
+  const char* kernel_str =
+      "global int g[1000] = {0}; \n\
+ __kernel void test1 (global unsigned int * A) \n\
+ { \n\
+ int id = get_global_id(0); \n\
+ g[id] = id; \n\
+ } \n\
+ __kernel void test2 (global unsigned int * A) \n\
+ { \n\
+ int id = get_global_id(0); \n\
+ A[id] = g[id]; \n\
+ } \n";
+  const size_t arrSize = 1000;
+  // Automatic storage: nothing to free on the early returns taken below.
+  cl_uint output_arr[arrSize] = {0};
+  cl_mem buffer = _wrapper->clCreateBuffer(
+      context_, CL_MEM_READ_WRITE, arrSize * sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  // Only valid buffers are handed to buffers_, which owns them from here on.
+  buffers_.push_back(buffer);
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    printBuildLog(_wrapper, program_, devices_[_deviceId]);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel1_ = _wrapper->clCreateKernel(program_, "test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel1 failed");
+  kernel2_ = _wrapper->clCreateKernel(program_, "test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel2 failed");
+  error_ = _wrapper->clSetKernelArg(kernel1_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = arrSize;
+  // The kernels are ordered with clFinish instead of events, so no cl_event
+  // object is created that would need to be released.
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel1_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, CL_TRUE,
+                                         0, sizeof(cl_uint) * arrSize,
+                                         output_arr, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  bool bResult = true;
+  for (unsigned int i = 0; i < arrSize; ++i) {
+    if (output_arr[i] != i) {
+      bResult = false;
+      break;
+    }
+  }
+  CHECK_RESULT((bResult == false), "Program Scope Variables - test0 failed");
+}
+
+// Sub-test 1: one kernel stores 55 in a program-scope scalar, a second kernel
+// copies it into a buffer, and the host checks the value.
+void OCLProgramScopeVariables::test1(void) {
+  const char* kernel_str =
+      "global int temp = 0; \n\
+ __kernel void test1 (global unsigned int * A) \n\
+ { \n\
+ int id = get_global_id(0); \n\
+ if (id == 0) temp = 55; \n\
+ } \n\
+ __kernel void test2 (global unsigned int * A) \n\
+ { \n\
+ int id = get_global_id(0); \n\
+ if (id == 0) A[0] = temp; \n\
+ } \n";
+  cl_uint output = 0;
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    printBuildLog(_wrapper, program_, devices_[_deviceId]);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel1_ = _wrapper->clCreateKernel(program_, "test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel1 failed");
+  kernel2_ = _wrapper->clCreateKernel(program_, "test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel2 failed");
+  error_ = _wrapper->clSetKernelArg(kernel1_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = 1;
+  // Ordered with clFinish rather than events; see test0.
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel1_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, CL_TRUE,
+                                         0, sizeof(cl_uint), &output, 0, NULL,
+                                         NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  CHECK_RESULT((output != 55), "Program Scope Variables - test1 failed");
+}
+
+// Sub-test 2: like sub-test 1, but the second kernel reads the scalar through
+// a program-scope pointer array; the expected value is 65.
+void OCLProgramScopeVariables::test2(void) {
+  const char* kernel_str =
+      "global int temp = 0; \n\
+ global int* ptr[] = {&temp}; \n\
+ __kernel void test1 (global unsigned int * A) \n\
+ { \n\
+ int id = get_global_id(0); \n\
+ if (id == 0) temp = 65; \n\
+ } \n\
+ __kernel void test2 (global unsigned int * A) \n\
+ { \n\
+ int id = get_global_id(0); \n\
+ if (id == 0) A[0] = *ptr[0]; \n\
+ } \n";
+  cl_uint output = 0;
+  cl_mem buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                           sizeof(cl_uint), 0, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed");
+  buffers_.push_back(buffer);
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &kernel_str, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource failed");
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId],
+                                    "-cl-std=CL2.0", NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    printBuildLog(_wrapper, program_, devices_[_deviceId]);
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram failed");
+  kernel1_ = _wrapper->clCreateKernel(program_, "test1", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel1 failed");
+  kernel2_ = _wrapper->clCreateKernel(program_, "test2", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel2 failed");
+  error_ = _wrapper->clSetKernelArg(kernel1_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  error_ = _wrapper->clSetKernelArg(kernel2_, 0, sizeof(cl_mem), (void*)&buffer);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed");
+  size_t global_work_size = 1;
+  // Ordered with clFinish rather than events; see test0.
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel1_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+  error_ =
+      _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel2_, 1, NULL,
+                                       &global_work_size, NULL, 0, NULL, NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel");
+  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish failed");
+  error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffer, CL_TRUE,
+                                         0, sizeof(cl_uint), &output, 0, NULL,
+                                         NULL);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer failed");
+  CHECK_RESULT((output != 65), "Program Scope Variables - test2 failed");
+}
+
+// Releases the kernels created by the sub-test, then closes the base test.
+unsigned int OCLProgramScopeVariables::close(void) {
+  if (kernel1_) {
+    error_ = _wrapper->clReleaseKernel(kernel1_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel1 failed");
+    kernel1_ = 0;
+  }
+  if (kernel2_) {
+    error_ = _wrapper->clReleaseKernel(kernel2_);
+    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel2 failed");
+    kernel2_ = 0;
+  }
+  return OCLTestImp::close();
+}
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h
new file mode 100644
index 0000000000..e0dc0429dd
--- /dev/null
+++
b/projects/clr/opencl/tests/ocltst/module/runtime/OCLProgramScopeVariables.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_ProgramScopeVariables_H_
+#define _OCL_ProgramScopeVariables_H_
+
+#include "OCLTestImp.h"
+
+// Runtime test for OpenCL C 2.0 program-scope (global address space)
+// variables: one kernel writes the variable, a second kernel reads it back.
+class OCLProgramScopeVariables : public OCLTestImp {
+ public:
+  OCLProgramScopeVariables();
+  virtual ~OCLProgramScopeVariables();
+
+ public:
+  void open(unsigned int test, char* units, double& conversion,
+            unsigned int deviceID) override;
+  void run(void) override;
+  unsigned int close(void) override;
+
+ private:
+  void test0(void);
+  void test1(void);
+  void test2(void);
+  // Set by open() when OpenCL C 2.0 is unavailable; run() then does nothing.
+  bool silentFailure = false;
+  // Kernel handles start out null so that close() is safe even when open()
+  // returns before resetting them.
+  cl_kernel kernel1_ = 0;
+  cl_kernel kernel2_ = 0;
+};
+
+#endif  // _OCL_ProgramScopeVariables_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp
new file mode 100644
index 0000000000..73d1915309
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp
@@ -0,0 +1,415 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "OCLRTQueue.h"
+
+#include
+#include
+#include
+#include
+
+#include "CL/cl.h"
+
+static const size_t Iterations = 0x100;
+static const size_t IterationDivider = 2;
+static const size_t MaxBuffers = IterationDivider;
+static const size_t BufSize = 0x800000;
+
+// Compute-heavy kernel: work-item id multiplies together 1 .. id / 0x400 - 1
+// and stores the product in out[id].
+const static char* strKernel =
+    "__kernel void factorial(__global uint* out) \n"
+    "{ \n"
+    " uint id = get_global_id(0); \n"
+    " uint factorial = 1; \n"
+    " for (uint i = 1; i < (id / 0x400); ++i) \n"
+    " { \n"
+    " factorial *= i; \n"
+    " } \n"
+    " out[id] = factorial; \n"
+    "} \n";
+
+OCLRTQueue::OCLRTQueue() : rtQueue_(NULL), rtQueue1_(NULL), kernel2_(NULL) {
+  // Without OpenCL 2.0 headers there is nothing to run.
+#ifndef CL_VERSION_2_0
+  _numSubTests = 0;
+#else
+  _numSubTests = 2;
+#endif
+  testID_ = 0;
+  failed_ = false;
+}
+
+OCLRTQueue::~OCLRTQueue() {}
+
+// Builds the kernel and creates MaxBuffers device buffers plus one
+// CL_MEM_ALLOC_HOST_PTR buffer. Sets failed_ (so that run() becomes a no-op)
+// when the device reports an OpenCL version below 2.0 or exposes fewer than
+// two real-time compute queues.
+void OCLRTQueue::open(unsigned int test, char* units, double& conversion,
+                      unsigned int deviceId) {
+#ifdef CL_VERSION_2_0
+  OCLTestImp::open(test, units, conversion, deviceId);
+  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
+  testID_ = test;
+  size_t param_size = 0;
+  char* strVersion = 0;
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
+                                     0, &param_size);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  strVersion = new char[param_size];
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
+                                     param_size, strVersion, 0);
+  // CL_DEVICE_VERSION is "OpenCL <major>.<minor> ...", so index 7 holds the
+  // major version digit. Evaluate it and free the string before any early
+  // return below; a string too short to hold the digit counts as unsupported.
+  const bool preCL20 = (error_ == CL_SUCCESS) &&
+                       ((param_size <= 7) || (strVersion[7] < '2'));
+  delete[] strVersion;
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (preCL20) {
+    failed_ = true;
+    return;
+  }
+  cl_uint rtQueues;
+#define CL_DEVICE_MAX_REAL_TIME_COMPUTE_QUEUES_AMD 0x404D
+#define CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD 0x404E
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_REAL_TIME_COMPUTE_QUEUES_AMD,
+                                     sizeof(rtQueues), &rtQueues, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+  if (rtQueues < 2) {
+    failed_ = true;
+    return;
+  }
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD,
+                                     sizeof(rtCUs_), &rtCUs_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
+                                     CL_DEVICE_MAX_COMPUTE_UNITS,
+                                     sizeof(maxCUs_), &maxCUs_, 0);
+  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
+
+  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
+                                                 &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
+
+  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
+                                    NULL, NULL);
+  if (error_ != CL_SUCCESS) {
+    // Query the log size first: a fixed-size buffer makes the query fail (and
+    // leaves the buffer uninitialized) whenever the log does not fit.
+    size_t logSize = 0;
+    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+    char* programLog = new char[logSize + 1];
+    programLog[0] = '\0';
+    if (_wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
+                                        CL_PROGRAM_BUILD_LOG, logSize,
+                                        programLog, 0) == CL_SUCCESS) {
+      programLog[logSize] = '\0';
+    }
+    printf("\n%s\n", programLog);
+    fflush(stdout);
+    delete[] programLog;
+  }
+  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
+
+  kernel_ = _wrapper->clCreateKernel(program_, "factorial", &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
+
+  cl_mem buffer;
+  for (size_t i = 0; i < MaxBuffers; ++i) {
+    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
+                                      BufSize * sizeof(cl_uint), NULL, &error_);
+    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+    buffers_.push_back(buffer);
+  }
+
+  // Extra host-visible buffer, stored after the MaxBuffers device buffers.
+  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
+                                    BufSize * sizeof(cl_uint), NULL, &error_);
+  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
+  buffers_.push_back(buffer);
+#endif
+}
+
+static void CL_CALLBACK notify_callback(const char* errinfo,
+                                        const void* private_info, size_t cb,
+                                        void* user_data) {}
+
+void OCLRTQueue::run(void) {
+#ifdef
CL_VERSION_2_0 + if (failed_) { + return; + } + + if (testID_ == 0) { + cu_ = rtCUs_ >> 1; + } else { + cu_ = rtCUs_; + } + // Create a real time queue +#define CL_QUEUE_REAL_TIME_COMPUTE_UNITS_AMD 0x404f + const cl_queue_properties cprops[] = { + CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(0), + CL_QUEUE_REAL_TIME_COMPUTE_UNITS_AMD, cu_, 0}; + rtQueue_ = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[_deviceId], cprops, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); + +#define CL_QUEUE_MEDIUM_PRIORITY_AMD 0x4050 + const cl_queue_properties cprops2[] = {CL_QUEUE_PROPERTIES, + static_cast<cl_queue_properties>(0), + CL_QUEUE_MEDIUM_PRIORITY_AMD, 0, 0}; + rtQueue1_ = _wrapper->clCreateCommandQueueWithProperties( + context_, devices_[_deviceId], cprops2, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + "clCreateCommandQueueWithProperties() failed"); + + void* values; + CPerfCounter timer; + cl_mem mapBuffer = buffers()[MaxBuffers]; + + values = _wrapper->clEnqueueMapBuffer( + cmdQueues_[_deviceId], mapBuffer, true, (CL_MAP_READ | CL_MAP_WRITE), 0, + BufSize * sizeof(cl_uint), 0, NULL, NULL, &error_); + + cl_mem buffer = buffers()[0]; + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + // SubTest: 1 + size_t gws[1] = {BufSize}; + size_t x; + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = 
_wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + + timer.Stop(); + + double sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + double perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf("\n Generic Queue(CUs: %d) Time: %.3fs\n", maxCUs_, sec); + + // SubTest: 2 + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(rtQueue_); + + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(rtQueue_); + + timer.Stop(); + + sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf(" RT Queue0 (CUs: %2d) Time: %.3fs\n", cu_, sec); + + // SubTest: 2 + + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + _wrapper->clFinish(rtQueue1_); + + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(rtQueue1_); + + 
timer.Stop(); + + sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf(" Medium Queue (CUs: %2d) Time: %.3fs\n", + maxCUs_ - cu_, sec); + + // SubTest: 3 + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(cmdQueues_[_deviceId]); + + timer.Stop(); + + sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf(" Generic Queue(CUs: %d) Time: %.3fs\n", maxCUs_ - cu_, + sec); + + // SubTest: 4 + for (x = 0; x < Iterations / 10; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFlush(cmdQueues_[_deviceId]); + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(rtQueue_); + + timer.Stop(); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + 
printf(" Async RT(CUs: %d) + Generic(CUs: %d) Time: %.3fs\n", cu_, + maxCUs_ - cu_, sec); + + // SubTest: 5 + for (x = 0; x < Iterations / 10; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFlush(cmdQueues_[_deviceId]); + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFinish(rtQueue1_); + + timer.Stop(); + _wrapper->clFinish(cmdQueues_[_deviceId]); + + sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf(" Async Medium(CUs: %d) + Generic(CUs: %d) Time: %.3fs\n", + maxCUs_ - cu_, maxCUs_ - cu_, sec); + + // SubTest: 6 + for (x = 0; x < Iterations / 10; x++) { + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFlush(cmdQueues_[_deviceId]); + timer.Reset(); + timer.Start(); + for (x = 0; x < 1; x++) { + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + _wrapper->clFlush(rtQueue_); + for (x = 0; x < 1; 
x++) { + error_ = _wrapper->clEnqueueNDRangeKernel(rtQueue1_, kernel_, 1, NULL, gws, + NULL, 0, NULL, NULL); + + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + } + + _wrapper->clFlush(rtQueue1_); + _wrapper->clFinish(rtQueue_); + _wrapper->clFinish(rtQueue1_); + timer.Stop(); + _wrapper->clFlush(cmdQueues_[_deviceId]); + + sec = timer.GetElapsedTime(); + // Buffer read bandwidth in GB/s + perf = ((double)BufSize * sizeof(cl_uint) * x * (double)(1e-09)) / sec; + + printf( + " Async RT0(CUs: %d) + Medium(CUs: %d) + Generic(CUs: %d) Time: %.3fs\n", + cu_, maxCUs_ - cu_, maxCUs_ - cu_, sec); + error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], mapBuffer, + values, 0, NULL, NULL); + _wrapper->clFinish(cmdQueues_[_deviceId]); +#endif +} + +unsigned int OCLRTQueue::close(void) { +#ifdef CL_VERSION_2_0 + if (NULL != rtQueue_) { + _wrapper->clReleaseCommandQueue(rtQueue_); + } + if (NULL != rtQueue1_) { + _wrapper->clReleaseCommandQueue(rtQueue1_); + } + if (NULL != kernel2_) { + _wrapper->clReleaseKernel(kernel2_); + } + + return OCLTestImp::close(); +#else + return CL_SUCCESS; +#endif +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h new file mode 100644 index 0000000000..b4f98dc5ae --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLRTQueue.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_RT_QUEUE_H_ +#define _OCL_RT_QUEUE_H_ + +#include "OCLTestImp.h" + +class OCLRTQueue : public OCLTestImp { + public: + OCLRTQueue(); + virtual ~OCLRTQueue(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + cl_command_queue rtQueue_; + cl_command_queue rtQueue1_; + cl_kernel kernel2_; + unsigned int testID_; + bool failed_; + cl_uint cu_; + cl_uint maxCUs_; + cl_uint rtCUs_; +}; + +#endif // _OCL_RT_QUEUE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp new file mode 100644 index 0000000000..02dc2be4ca --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.cpp @@ -0,0 +1,372 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLReadWriteImage.h" + +#include +#include +#include + +#include +#ifdef ATI_OS_LINUX +#include +#include +#endif + +#include "CL/cl.h" + +const static size_t imageSize = 4; +const static size_t MaxSubTests = 4; + +static const char *rgba8888_kernel_read = + "\n" + "__kernel void read_rgba8888(read_only image2d_t srcimg, __global uchar4 " + "*dst, sampler_t sampler)\n" + "{\n" + " int tid_x = get_global_id(0);\n" + " int tid_y = get_global_id(1);\n" + " int indx = tid_y * get_image_width(srcimg) + tid_x;\n" + " float4 color;\n" + "\n" + " color = read_imagef(srcimg, sampler, (int2)(tid_x, tid_y)) * 255.0f;\n" + " dst[indx] = convert_uchar4_rte(color);\n" + "\n" + "}\n"; + +static const char *rgba8888_kernel_write = + "\n" + "__kernel void write_rgba8888(__global unsigned char *src, write_only " + "image2d_t dstimg)\n" + "{\n" + " int tid_x = get_global_id(0);\n" + " int tid_y = get_global_id(1);\n" + " int indx = tid_y * get_image_width(dstimg) + tid_x;\n" + " float4 color;\n" + "\n" + " indx *= 4;\n" + " color = (float4)((float)src[indx+0], (float)src[indx+1], " + "(float)src[indx+2], (float)src[indx+3]);\n" + " color /= (float4)(255.0f, 255.0f, 255.0f, 255.0f);\n" + " write_imagef(dstimg, (int2)(tid_x, tid_y), color);\n" + "\n" + "}\n"; + +OCLReadWriteImage::OCLReadWriteImage() { + _numSubTests = MaxSubTests; + failed_ = false; + imageWidth = imageSize; + imageHeight = imageSize; + imageDepth = imageSize; +} + +OCLReadWriteImage::~OCLReadWriteImage() {} + +bool OCLReadWriteImage::verifyImageData(unsigned char *inputImageData, + unsigned char *output, size_t width, + size_t height) { + for (unsigned int i = 0; i < 4 * width * height; i++) { + if (output[i] != inputImageData[i]) { + printf( + "Verification failed at byte %u in the output image => %x != %x " + "[reference]\n", + i, output[i], inputImageData[i]); + return false; + } + } + return true; +} +void OCLReadWriteImage::open(unsigned int test, char *units, double &conversion, + unsigned int 
deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + testID_ = test; + + cl_bool imageSupport; + size_t size; + for (size_t i = 0; i < deviceCount_; ++i) { + _wrapper->clGetDeviceInfo(devices_[i], CL_DEVICE_IMAGE_SUPPORT, + sizeof(imageSupport), &imageSupport, &size); + if (!imageSupport) { + failed_ = true; + return; + } + } + +#ifdef ATI_OS_LINUX + failed_ = true; + return; +#endif + if (test == 1) { + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, &rgba8888_kernel_read, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, + 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "read_rgba8888", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + } else if ((test == 2) || (test == 3)) { + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, &rgba8888_kernel_write, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, + 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "write_rgba8888", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + } + + cl_mem memory; + cl_image_format imgageFormat; 
+ imgageFormat.image_channel_order = CL_RGBA; + imgageFormat.image_channel_data_type = CL_UNORM_INT8; + bufferSize = imageWidth * imageHeight * 4 * sizeof(unsigned char); + + memory = _wrapper->clCreateImage2D(context_, CL_MEM_READ_WRITE, &imgageFormat, + imageWidth, imageHeight, 0, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage() failed"); + + buffers_.push_back(memory); + + if ((test == 1) || (test == 2) || (test == 3)) { + memory = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufferSize, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(memory); + } +} + +static void CL_CALLBACK notify_callback(const char *errinfo, + const void *private_info, size_t cb, + void *user_data) {} + +void OCLReadWriteImage::run(void) { + if (failed_) { + return; + } + + const unsigned int inputImageData[imageSize][imageSize] = { + {0xc0752fac, 0x67c3fb43, 0xf215d309, 0xd8465724}, + {0xc13a8c58, 0xae5727e6, 0x19a55158, 0x9409484d}, + {0xc5f3d073, 0xc0af4ffe, 0xb1d86352, 0x93931df3}, + {0xc120a78e, 0x207fb909, 0x97f4ca1f, 0x72cbfea3}}; + + unsigned char *outputPtr = (unsigned char *)malloc(bufferSize); + + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {imageWidth, imageHeight, 1}; + bool validation; + size_t threads[2]; + + switch (testID_) { + case 0: // ImageWrite (w/ sDMA) and ImageRead (w/ sDMA) + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], buffers_[0], + true, origin, region, 0, 0, + inputImageData, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed"); + + error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], buffers_[0], + true, origin, region, 0, 0, + outputPtr, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed"); + + validation = verifyImageData((unsigned char *)&inputImageData, outputPtr, + imageWidth, imageHeight); + if (validation) { + printf("ImageWrite (w/ sDMA) -> ImageRead (w/ sDMA) 
passed!\n"); + } else { + CHECK_RESULT(true, + "ImageWrite (w/ sDMA) -> ImageRead (w/ sDMA) failed!\n"); + } + break; + case 1: // ImageWrite (w/ sDMA) and ImageRead (w/ kernel) + error_ = _wrapper->clEnqueueWriteImage(cmdQueues_[_deviceId], buffers_[0], + true, origin, region, 0, 0, + inputImageData, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteImage() failed"); + + cl_sampler sampler; + sampler = _wrapper->clCreateSampler(context_, CL_FALSE, + CL_ADDRESS_CLAMP_TO_EDGE, + CL_FILTER_NEAREST, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[0], + &buffers_[0]); + error_ |= clSetKernelArg(kernel_, 1, sizeof buffers_[1], &buffers_[1]); + error_ |= clSetKernelArg(kernel_, 2, sizeof sampler, &sampler); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n"); + + threads[0] = (unsigned int)imageWidth; + threads[1] = (unsigned int)imageHeight; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, + NULL, threads, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[1], + CL_TRUE, 0, bufferSize, outputPtr, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + validation = verifyImageData((unsigned char *)&inputImageData, outputPtr, + imageWidth, imageHeight); + if (validation) { + printf("ImageWrite (w/ sDMA) -> ImageRead (w/ kernel) passed!\n"); + } else { + CHECK_RESULT(true, + "ImageWrite (w/ sDMA) -> ImageRead (w/ kernel) failed!\n"); + } + + break; + case 2: // ImageWrite (w/ kernel) and ImageRead (w/ sDMA) + error_ = _wrapper->clEnqueueWriteBuffer( + cmdQueues_[_deviceId], buffers_[1], CL_TRUE, 0, bufferSize, + inputImageData, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = 
_wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[1], + &buffers_[1]); + error_ |= clSetKernelArg(kernel_, 1, sizeof buffers_[0], &buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n"); + + threads[0] = (unsigned int)imageWidth; + threads[1] = (unsigned int)imageHeight; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, + NULL, threads, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], buffers_[0], + true, origin, region, 0, 0, + outputPtr, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed"); + + validation = verifyImageData((unsigned char *)&inputImageData, outputPtr, + imageWidth, imageHeight); + if (validation) { + printf("ImageWrite (w/ kernel) -> ImageRead (w/ sDMA) passed!\n"); + } else { + CHECK_RESULT(true, + "ImageWrite (w/ kernel) -> ImageRead (w/ sDMA) failed!\n"); + } + break; + case 3: // ImageWrite (w/ kernel) and ImageRead (w/ kernel) + error_ = _wrapper->clEnqueueWriteBuffer( + cmdQueues_[_deviceId], buffers_[1], CL_TRUE, 0, bufferSize, + inputImageData, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[1], + &buffers_[1]); + error_ |= clSetKernelArg(kernel_, 1, sizeof buffers_[0], &buffers_[0]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n"); + + threads[0] = (unsigned int)imageWidth; + threads[1] = (unsigned int)imageHeight; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, + NULL, threads, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + // recreate the program_ to use the read kernel + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, &rgba8888_kernel_read, NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), + 
"clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[_deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, + 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "read_rgba8888", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + sampler = _wrapper->clCreateSampler(context_, CL_FALSE, + CL_ADDRESS_CLAMP_TO_EDGE, + CL_FILTER_NEAREST, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler failed"); + + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof buffers_[0], + &buffers_[0]); + error_ |= clSetKernelArg(kernel_, 1, sizeof buffers_[1], &buffers_[1]); + error_ |= clSetKernelArg(kernel_, 2, sizeof sampler, &sampler); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg failed\n"); + + threads[0] = (unsigned int)imageWidth; + threads[1] = (unsigned int)imageHeight; + + error_ = + _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, + NULL, threads, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[1], + CL_TRUE, 0, bufferSize, outputPtr, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + validation = verifyImageData((unsigned char *)&inputImageData, outputPtr, + imageWidth, imageHeight); + if (validation) { + printf("ImageWrite (w/ kernel) -> ImageRead (w/ kernel) passed!\n"); + } else { + CHECK_RESULT( + true, "ImageWrite (w/ kernel) -> ImageRead (w/ kernel) failed!\n"); + } + + break; + } + + free(outputPtr); +} + +unsigned int OCLReadWriteImage::close(void) { return OCLTestImp::close(); } diff --git 
a/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h new file mode 100644 index 0000000000..c22bc51b93 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLReadWriteImage.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_READ_WRITE_IMAGE_H_ +#define _OCL_READ_WRITE_IMAGE_H_ + +#include "OCLTestImp.h" + +class OCLReadWriteImage : public OCLTestImp { + public: + OCLReadWriteImage(); + virtual ~OCLReadWriteImage(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; + unsigned int testID_; + size_t maxSize_; + size_t imageWidth; + size_t imageHeight; + size_t imageDepth; + size_t bufferSize; + cl_sampler sampler; + bool verifyImageData(unsigned char* inputImageData, unsigned char* output, + size_t width, size_t height); +}; + +#endif // _OCL_READ_WRITE_IMAGE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp new file mode 100644 index 0000000000..f0081727cc --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.cpp @@ -0,0 +1,515 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLSDI.h" + +#include "Timer.h" +#define NUM_TESTS 6 + +#include + +typedef struct _threadInfo { + int threadID_; + OCLSDI* testObj_; +} ThreadInfo; +const char* kernel_str_ = + "__kernel void test_kernel(global unsigned int * A) \ + { \ + int id = get_global_id(0); \ + A[id] = id + 2;\ + } "; +const char* testNames[NUM_TESTS] = { + "WriteBuffer", "CopyBuffer", "NDRangeKernel", + "MapBuffer", "WriteBufferRect", "CopyImageToBuffer", +}; + +void* ThreadMain(void* data) { + if (data == NULL) { + return 0; + } + ThreadInfo* threadData = (ThreadInfo*)data; + threadData->testObj_->threadEntry(threadData->threadID_); + return NULL; +} + +OCLSDI::OCLSDI() { + // If there are two different gpus in the system, + // we have to test each of them as sender and receiver + _numSubTests = 2 * NUM_TESTS; +} + +OCLSDI::~OCLSDI() {} + +void OCLSDI::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + cl_uint numPlatforms = 0; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + _crcword = 0; + conversion = 1.0f; + program_ = 0; + kernel_ = 0; + srcBuff_ = 0; + _openTest = test % NUM_TESTS; + bufSize_ = 0x10000; + error_ = 0; + markerValue_ = 0x12345; + inputArr_ = 0; + outputArr_ = 0; + success_ = true; + extPhysicalBuff_ = 0; + silentFailure = false; + busAddressableBuff_ = 0; + devices_[0] = devices_[1] = 0; + contexts_[0] = contexts_[1] = 0; + cmd_queues_[0] = cmd_queues_[1] = 0; + image_ = 0; + + inputArr_ = (cl_uint*)malloc(bufSize_); + outputArr_ = (cl_uint*)malloc(bufSize_); + for (unsigned int i = 0; i < (bufSize_ / sizeof(cl_uint)); ++i) { + inputArr_[i] = i + 1; + outputArr_[i] = 0; + } + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + 
CHECK_RESULT(numPlatforms == 0, "clGetPlatformIDs failed"); + error_ = _wrapper->clGetPlatformIDs(1, &platform, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + error_ = _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, + &num_devices); + if (num_devices < 2) { + printf("\nSilent Failure: Two GPUs are required to run OCLSdi test\n"); + silentFailure = true; + return; + } + error_ = + _wrapper->clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 2, devices_, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + if (test >= NUM_TESTS) { + cl_device_id temp = devices_[0]; + devices_[0] = devices_[1]; + devices_[1] = temp; + } + size_t param_size = 0; + char* strExtensions = 0; + error_ = _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_EXTENSIONS, 0, 0, + &param_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strExtensions = (char*)malloc(param_size); + error_ = _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_EXTENSIONS, + param_size, strExtensions, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strstr(strExtensions, "cl_amd_bus_addressable_memory") == 0) { + printf( + "\nSilent Failure: cl_amd_bus_addressable_memory extension is not " + "enabled on GPU 0\n"); + silentFailure = true; + free(strExtensions); + return; + } + free(strExtensions); + error_ = _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_EXTENSIONS, 0, 0, + &param_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strExtensions = (char*)malloc(param_size); + error_ = _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_EXTENSIONS, + param_size, strExtensions, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + if (strstr(strExtensions, "cl_amd_bus_addressable_memory") == 0) { + printf( + "\nSilent Failure: cl_amd_bus_addressable_memory extension is not " + "enabled on GPU 1\n"); + silentFailure = true; + free(strExtensions); + return; + } + free(strExtensions); + deviceNames_ = 
" ["; + param_size = 0; + char* strDeviceName = 0; + error_ = + _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_NAME, 0, 0, &param_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strDeviceName = (char*)malloc(param_size); + error_ = _wrapper->clGetDeviceInfo(devices_[1], CL_DEVICE_NAME, param_size, + strDeviceName, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + deviceNames_ = deviceNames_ + strDeviceName; + free(strDeviceName); + error_ = + _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_NAME, 0, 0, &param_size); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + strDeviceName = (char*)malloc(param_size); + error_ = _wrapper->clGetDeviceInfo(devices_[0], CL_DEVICE_NAME, param_size, + strDeviceName, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + deviceNames_ = deviceNames_ + "->"; + deviceNames_ = deviceNames_ + strDeviceName; + free(strDeviceName); + deviceNames_ = deviceNames_ + "]"; + cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)platform, 0}; + contexts_[0] = + _wrapper->clCreateContext(props, 1, &devices_[0], 0, 0, &error_); + CHECK_RESULT(contexts_[0] == 0, "clCreateContext failed"); + contexts_[1] = + _wrapper->clCreateContext(props, 1, &devices_[1], 0, 0, &error_); + CHECK_RESULT(contexts_[1] == 0, "clCreateContext failed"); + cmd_queues_[0] = + _wrapper->clCreateCommandQueue(contexts_[0], devices_[0], 0, NULL); + CHECK_RESULT(cmd_queues_[0] == 0, "clCreateCommandQueue failed"); + cmd_queues_[1] = + _wrapper->clCreateCommandQueue(contexts_[1], devices_[1], 0, NULL); + CHECK_RESULT(cmd_queues_[1] == 0, "clCreateCommandQueue failed"); + busAddressableBuff_ = _wrapper->clCreateBuffer( + contexts_[0], CL_MEM_BUS_ADDRESSABLE_AMD, bufSize_, 0, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + error_ = _wrapper->clEnqueueMakeBuffersResidentAMD( + cmd_queues_[0], 1, &busAddressableBuff_, true, &busAddr_, 0, 0, 0); + 
CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueMakeBuffersResidentAMD failed"); + extPhysicalBuff_ = _wrapper->clCreateBuffer( + contexts_[1], CL_MEM_EXTERNAL_PHYSICAL_AMD, bufSize_, &busAddr_, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer failed"); + error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + 0, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + srcBuff_ = _wrapper->clCreateBuffer(contexts_[1], + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufSize_, inputArr_, &error_); + CHECK_RESULT(error_ != CL_SUCCESS, "clCreateBuffer failed"); + error_ = _wrapper->clEnqueueMigrateMemObjects(cmd_queues_[1], 1, + &extPhysicalBuff_, 0, 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueMigrateMemObjects failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + error_ = _wrapper->clEnqueueMigrateMemObjects(cmd_queues_[1], 1, &srcBuff_, 0, + 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueMigrateMemObjects failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + if (_openTest == 2) { + program_ = _wrapper->clCreateProgramWithSource(contexts_[1], 1, + &kernel_str_, NULL, &error_); + CHECK_RESULT(error_, "clCreateProgramWithSource failed"); + error_ = + _wrapper->clBuildProgram(program_, 1, &devices_[1], NULL, NULL, NULL); + if (error_ != CL_SUCCESS) { + char* errorstr; + size_t size; + _wrapper->clGetProgramBuildInfo(program_, devices_[1], + CL_PROGRAM_BUILD_LOG, 0, NULL, &size); + errorstr = new char[size]; + _wrapper->clGetProgramBuildInfo( + program_, devices_[1], CL_PROGRAM_BUILD_LOG, size, errorstr, &size); + printf("\n%s\n", errorstr); + delete[] errorstr; + } + CHECK_RESULT(error_, "clBuildProgram failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "test_kernel", &error_); + CHECK_RESULT(error_, "clCreateKernel 
failed"); + error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), + (void*)&extPhysicalBuff_); + CHECK_RESULT(error_, "clSetKernelArg failed"); + } + if (_openTest == 5) { + cl_image_format format = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = bufSize_ / sizeof(cl_uint); + desc.image_height = 0; + desc.image_depth = 0; + desc.image_array_size = 0; + desc.image_row_pitch = 0; + desc.image_slice_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = (cl_mem)NULL; + image_ = _wrapper->clCreateImage(contexts_[1], CL_MEM_READ_ONLY, &format, + &desc, 0, &error_); + CHECK_RESULT(error_, "clCreateImage failed"); + } +} + +void OCLSDI::run(void) { + if (silentFailure) { + return; + } + ++markerValue_; + OCLutil::Thread threads[2]; + ThreadInfo threadInfo[2]; + threadInfo[0].testObj_ = threadInfo[1].testObj_ = this; + threadInfo[0].threadID_ = 0; + threadInfo[1].threadID_ = 1; + threads[0].create(ThreadMain, &threadInfo[0]); + threads[1].create(ThreadMain, &threadInfo[1]); + threads[0].join(); + threads[1].join(); + char* descString = (char*)malloc(25 + deviceNames_.size()); + sprintf(descString, "%-20s%s", testNames[_openTest], deviceNames_.c_str()); + testDescString = descString; + free(descString); + if (!success_) { + _errorFlag = true; + _crcword += 1; + } +} + +void OCLSDI::threadEntry(int threadID) { + if (silentFailure) { + return; + } + switch (_openTest) { + case 0: + testEnqueueWriteBuffer(threadID); + break; + case 1: + testEnqueueCopyBuffer(threadID); + break; + case 2: + testEnqueueNDRangeKernel(threadID); + break; + case 3: + testEnqueueMapBuffer(threadID); + break; + case 4: + testEnqueueWriteBufferRect(threadID); + break; + case 5: + testEnqueueCopyImageToBuffer(threadID); + break; + } +} + +unsigned int OCLSDI::close(void) { + if (srcBuff_) { + error_ = _wrapper->clReleaseMemObject(srcBuff_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject 
failed"); + } + if (extPhysicalBuff_) { + error_ = _wrapper->clReleaseMemObject(extPhysicalBuff_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed"); + } + if (busAddressableBuff_) { + error_ = _wrapper->clReleaseMemObject(busAddressableBuff_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed"); + } + if (cmd_queues_[0]) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queues_[0]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (cmd_queues_[1]) { + error_ = _wrapper->clReleaseCommandQueue(cmd_queues_[1]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, + "clReleaseCommandQueue failed"); + } + if (contexts_[0]) { + error_ = _wrapper->clReleaseContext(contexts_[0]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (contexts_[1]) { + error_ = _wrapper->clReleaseContext(contexts_[1]); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); + } + if (program_) { + error_ = _wrapper->clReleaseProgram(program_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); + } + if (kernel_) { + error_ = _wrapper->clReleaseKernel(kernel_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); + } + if (image_) { + error_ = _wrapper->clReleaseMemObject(image_); + CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject failed"); + } + if (inputArr_) { + free(inputArr_); + } + if (outputArr_) { + free(outputArr_); + } + return _crcword; +} + +void OCLSDI::readAndVerifyResult() { + memset(outputArr_, 0, bufSize_); + error_ = _wrapper->clEnqueueWaitSignalAMD(cmd_queues_[0], busAddressableBuff_, + markerValue_, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed"); + error_ = _wrapper->clEnqueueReadBuffer(cmd_queues_[0], busAddressableBuff_, + CL_TRUE, 0, bufSize_, outputArr_, 0, 0, + NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueReadBuffer failed"); + 
success_ = (memcmp(inputArr_, outputArr_, bufSize_) == 0); +} + +void OCLSDI::testEnqueueCopyImageToBuffer(int threadID) { + if (threadID == 0) { + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {bufSize_ / sizeof(cl_uint), 1, 1}; + memset(inputArr_, (_openTest + 1), bufSize_); + error_ = + _wrapper->clEnqueueWriteImage(cmd_queues_[1], image_, CL_TRUE, origin, + region, 0, 0, inputArr_, 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueWriteImage failed"); + _wrapper->clFinish(cmd_queues_[1]); + error_ = _wrapper->clEnqueueCopyImageToBuffer( + cmd_queues_[1], image_, extPhysicalBuff_, origin, region, 0, 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueCopyImageToBuffer failed"); + _wrapper->clFinish(cmd_queues_[1]); + error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + markerValue_, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + } else { + readAndVerifyResult(); + } +} + +void OCLSDI::testEnqueueWriteBufferRect(int threadID) { + size_t width = (size_t)sqrt((float)bufSize_); + size_t bufOrigin[3] = {0, 0, 0}; + size_t hostOrigin[3] = {0, 0, 0}; + size_t region[3] = {width, width, 1}; + if (threadID == 0) { + memset(inputArr_, (_openTest + 1), bufSize_); + error_ = _wrapper->clEnqueueWriteBufferRect( + cmd_queues_[1], extPhysicalBuff_, CL_TRUE, bufOrigin, hostOrigin, + region, width, 0, width, 0, inputArr_, 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed"); + error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + markerValue_, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + } else { + memset(outputArr_, 0, bufSize_); + error_ = _wrapper->clEnqueueWaitSignalAMD( + cmd_queues_[0], busAddressableBuff_, markerValue_, 0, 0, 0); + CHECK_RESULT(error_ != 
CL_SUCCESS, "clEnqueueWaitSignalAMD failed"); + error_ = _wrapper->clEnqueueReadBufferRect( + cmd_queues_[0], busAddressableBuff_, CL_TRUE, bufOrigin, hostOrigin, + region, width, 0, width, 0, outputArr_, 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueReadBufferRect failed"); + success_ = (memcmp(inputArr_, outputArr_, bufSize_) == 0); + } +} + +void OCLSDI::testEnqueueMapBuffer(int threadID) { + if (threadID == 0) { + memset(inputArr_, (_openTest + 1), bufSize_); + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queues_[1], extPhysicalBuff_, + CL_TRUE, 0, bufSize_, inputArr_, 0, + 0, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed"); + error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + markerValue_, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + } else { + error_ = _wrapper->clEnqueueWaitSignalAMD( + cmd_queues_[0], busAddressableBuff_, markerValue_, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed"); + void* ptr = _wrapper->clEnqueueMapBuffer( + cmd_queues_[0], busAddressableBuff_, CL_TRUE, CL_MAP_READ, 0, bufSize_, + 0, 0, 0, &error_); + CHECK_RESULT(error_, "clEnqueueMapBuffer failed"); + success_ = (memcmp(inputArr_, ptr, bufSize_) == 0); + error_ = _wrapper->clEnqueueUnmapMemObject( + cmd_queues_[0], busAddressableBuff_, ptr, 0, 0, 0); + CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed"); + error_ = _wrapper->clFinish(cmd_queues_[0]); + CHECK_RESULT(error_, "clFinish failed"); + } +} + +void OCLSDI::testEnqueueNDRangeKernel(int threadID) { + if (threadID == 0) { + size_t global_work_size = bufSize_ / sizeof(cl_uint); + error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queues_[1], kernel_, 1, NULL, + &global_work_size, NULL, 0, NULL, + NULL); + CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + 
CHECK_RESULT(error_, "clFinish failed"); + error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + markerValue_, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + } else { + memset(outputArr_, 0, bufSize_); + error_ = _wrapper->clEnqueueWaitSignalAMD( + cmd_queues_[0], busAddressableBuff_, markerValue_, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWaitSignalAMD failed"); + error_ = _wrapper->clEnqueueReadBuffer(cmd_queues_[0], busAddressableBuff_, + CL_TRUE, 0, bufSize_, outputArr_, 0, + 0, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed"); + success_ = true; + for (cl_uint i = 0; i < bufSize_ / sizeof(cl_uint); ++i) { + success_ &= (outputArr_[i] == i + 2); + } + } +} + +void OCLSDI::testEnqueueCopyBuffer(int threadID) { + if (threadID == 0) { + memset(inputArr_, (_openTest + 1), bufSize_); + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queues_[1], srcBuff_, CL_TRUE, + 0, bufSize_, inputArr_, 0, 0, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed"); + error_ = _wrapper->clEnqueueCopyBuffer(cmd_queues_[1], srcBuff_, + extPhysicalBuff_, 0, 0, bufSize_, 0, + NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyBuffer failed"); + error_ = _wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + markerValue_, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + } else { + readAndVerifyResult(); + } +} + +void OCLSDI::testEnqueueWriteBuffer(int threadID) { + if (threadID == 0) { + memset(inputArr_, (_openTest + 1), bufSize_); + error_ = _wrapper->clEnqueueWriteBuffer(cmd_queues_[1], extPhysicalBuff_, + CL_TRUE, 0, bufSize_, inputArr_, 0, + 0, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteBuffer failed"); + error_ = 
_wrapper->clEnqueueWriteSignalAMD(cmd_queues_[1], extPhysicalBuff_, + markerValue_, 0, 0, 0, 0); + CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueWriteSignalAMD failed"); + error_ = _wrapper->clFinish(cmd_queues_[1]); + CHECK_RESULT(error_, "clFinish failed"); + } else { + readAndVerifyResult(); + } +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h new file mode 100644 index 0000000000..cf19d2d014 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSDI.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_OCLSDI_H_ +#define _OCL_OCLSDI_H_ +#include + +#include "OCLTestImp.h" + +class OCLSDI : public OCLTestImp { + public: + OCLSDI(); + virtual ~OCLSDI(); + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + void threadEntry(int threadID); + + private: + void testEnqueueWriteBuffer(int threadID); + void testEnqueueCopyBuffer(int threadID); + void testEnqueueNDRangeKernel(int threadID); + void testEnqueueMapBuffer(int threadID); + void testEnqueueWriteBufferRect(int threadID); + void testEnqueueCopyImageToBuffer(int threadID); + void readAndVerifyResult(); + + bool silentFailure; + cl_context contexts_[2]; + cl_device_id devices_[2]; + cl_command_queue cmd_queues_[2]; + cl_mem extPhysicalBuff_; + cl_mem busAddressableBuff_; + cl_int error_; + cl_bus_address_amd busAddr_; + cl_uint* inputArr_; + cl_uint* outputArr_; + unsigned int bufSize_; + bool success_; + cl_uint markerValue_; + cl_mem srcBuff_; + cl_program program_; + cl_kernel kernel_; + cl_mem image_; + std::string deviceNames_; +}; +#endif // _OCL_OCLSDI_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp new file mode 100644 index 0000000000..cf78a51517 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.cpp @@ -0,0 +1,612 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
/**
 * Returns the number of elements of a built-in array.
 *
 * The argument is taken by reference to array, so passing a pointer does not
 * compile instead of silently yielding a wrong count.
 *
 * @tparam T element type (deduced)
 * @tparam N array extent (deduced)
 * @return N
 */
template <typename T, unsigned N>
static unsigned countOf(const T (&)[N]) {
  return N;
}
_numSubTests = countOf(sources); } + +OCLSVM::~OCLSVM() {} + +void OCLSVM::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_ERROR(error_, "Error opening test"); + _openTest = test; + + if (!isOpenClSvmAvailable(devices_[_deviceId])) { + printf("Device does not support any SVM features, skipping...\n"); + return; + } + + program_ = _wrapper->clCreateProgramWithSource( + context_, 1, sources + _openTest, NULL, &error_); + CHECK_ERROR(error_, "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], + "-cl-std=CL2.0", NULL, NULL); + CHECK_ERROR(error_, "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "test", &error_); + CHECK_ERROR(error_, "clCreateKernel() failed"); +} + +#ifndef CL_VERSION_2_0 +// make sure the tests compile in OpenCL <= 1.2 +void OCLSVM::runFineGrainedBuffer() {} +void OCLSVM::runFineGrainedSystem() {} +void OCLSVM::runFineGrainedSystemLargeAllocations() {} +void OCLSVM::runLinkedListSearchUsingFineGrainedSystem() {} +void OCLSVM::runPlatformAtomics() {} +void OCLSVM::runEnqueueOperations() {} +void OCLSVM::runSvmArgumentsAreRecognized() {} +void OCLSVM::runSvmCommandsExecutedInOrder() {} +void OCLSVM::runIdentifySvmBuffers() {} +#else + +void OCLSVM::runFineGrainedBuffer() { + if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) { + printf( + "Device does not support fined-grained buffer sharing, skipping " + "test...\n"); + return; + } + const size_t numElements = 256; + int* ptr = (int*)clSVMAlloc(context_, + CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, + numElements * sizeof(int), 0); + CHECK_RESULT(!ptr, "clSVMAlloc() failed"); + + error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + size_t gws[1] = {numElements}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, 
gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "Queue::finish() failed"); + + size_t matchingElements = std::count(ptr, ptr + numElements, (int)0xDEADBEEF); + CHECK_RESULT(matchingElements != numElements, "Expected: %zd, found:%zd", + numElements, matchingElements); + clSVMFree(context_, ptr); +} + +void OCLSVM::runFineGrainedSystem() { + if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) { + printf( + "Device does not support fined-grained system sharing, skipping " + "test...\n"); + return; + } + + const size_t numElements = 256; + int* ptr = new int[numElements]; + int* ptr2 = new int[numElements]; + error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + error_ = clSetKernelArgSVMPointer(kernel_, 1, ptr2); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + size_t gws[1] = {numElements}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "Queue::finish() failed"); + + size_t matchingElements = std::count(ptr, ptr + numElements, (int)0xDEADBEEF); + size_t matchingElements2 = + std::count(ptr2, ptr2 + numElements, (int)0xDEADF00D); + CHECK_RESULT(matchingElements + matchingElements2 != 2 * numElements, + "Expected: %zd, found:%zd", numElements * 2, + matchingElements + matchingElements2); + delete[] ptr; + delete[] ptr2; +} + +void OCLSVM::runFineGrainedSystemLargeAllocations() { +#ifdef _WIN32 + if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) { + printf( + "Device does not support fined-grained system sharing on Lnx, skipping " + "test...\n"); + return; + } + + // Max allowed multiplier for malloc + size_t allowedMemSize = getTotalSystemMemory() >> 12; + + size_t numElements = 
256; + + char* s = getenv("OCLSVM_MALLOC_GB_SIZE"); + char* s2 = getenv("OCLSVM_MEMSET_ALLOC"); + + for (int j = 1; j <= NUM_SIZES; j++) { + numElements = 131072 * j; + + if (s != NULL) numElements = 131072 * atoi(s); + + if (numElements > allowedMemSize) break; + + void* ptr = malloc(numElements * 1024 * sizeof(uint64_t)); + CHECK_ERROR(ptr == NULL, "malloc failure"); + + if (s2 != NULL) memset(ptr, 0, numElements * 1024 * sizeof(uint64_t)); + + error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + size_t gws[1] = {numElements}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "Queue::finish() failed"); + + uint64_t* ptr64 = reinterpret_cast(ptr); + // Do a check + for (int i = 0; i < numElements; i++) { + if ((int)ptr64[i * 1024] != 0xBAADF00D) { + uint64_t temp = ptr64[i * 1024]; + delete[] ptr; + CHECK_RESULT(temp != 0xBAADF00D, "Found: %d, Expected:%d", temp, + 0xBAADF00D); + } + } + delete[] ptr; + } +#endif +} + +void OCLSVM::runLinkedListSearchUsingFineGrainedSystem() { + if (!(svmCaps_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) { + printf( + "Device does not support fined-grained system sharing, skipping " + "test...\n"); + return; + } + + uint64_t input[] = {34, 6, 0, 11, 89, 34, 6, 6, 6, 0xDEADBEEF}; + int inputSize = countOf(input); + Node* ptr = NULL; + for (int i = 0; i < inputSize; i++) { + ptr = new Node(input[i], ptr); + } + error_ = clSetKernelArgSVMPointer(kernel_, 0, ptr); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + size_t gws[1] = {1}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + 
CHECK_ERROR(error_, "Queue::finish() failed"); + + int matchingElements = 0; + // verify result while deallocating resources at the same time + while (ptr) { + if (ptr->value_ == 0xDEADBEEF) { + matchingElements++; + } + Node* tmp = ptr; + ptr = (Node*)ptr->next_; + delete tmp; + } + CHECK_RESULT(matchingElements != inputSize, "Expected: %d, found:%d", + inputSize, matchingElements); +} + +static int atomicIncrement(volatile int* loc) { +#if defined(_MSC_VER) + return _InterlockedIncrement((volatile long*)loc); +#elif defined(__GNUC__) + return __sync_fetch_and_add(loc, 1); +#endif + printf("Atomic increment not supported, aborting..."); + std::abort(); + return 0; +} + +void OCLSVM::runPlatformAtomics() { + if (!(svmCaps_ & CL_DEVICE_SVM_ATOMICS)) { + printf("SVM atomics not supported, skipping test...\n"); + return; + } + + volatile int* value = (volatile int*)clSVMAlloc( + context_, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(int), + 0); + CHECK_RESULT(!value, "clSVMAlloc() failed"); + *value = 0; + const int numIterations = 1000000; + error_ = clSetKernelArgSVMPointer(kernel_, 0, (const void*)value); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + error_ = clSetKernelArg(kernel_, 1, sizeof(numIterations), &numIterations); + CHECK_ERROR(error_, "clSetKernelArg() failed"); + + size_t gws[1] = {1}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + + for (int i = 0; i < numIterations; i++) { + atomicIncrement(value); + } + + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "Queue::finish() failed"); + + int expected = numIterations * 2; + CHECK_RESULT(*value != expected, "Expected: %d, found:%d", expected, *value); + clSVMFree(context_, (void*)value); +} + +void OCLSVM::runEnqueueOperations() { + size_t numElements = 32; + size_t size = numElements * 4; + int* ptr0 = 
(int*)clSVMAlloc(context_, 0, size, 0); + CHECK_RESULT(!ptr0, "clSVMAlloc() failed"); + int* ptr1 = (int*)clSVMAlloc(context_, 0, size, 0); + CHECK_RESULT(!ptr1, "clSVMAlloc() failed"); + cl_event userEvent = clCreateUserEvent(context_, &error_); + CHECK_ERROR(error_, "clCreateUserEvent() failed"); + + cl_command_queue queue = cmdQueues_[_deviceId]; + // coarse-grained buffer semantics: the SVM pointer needs to be mapped + // before the pointer can write to it + error_ = + clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, ptr0, size, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueSVMMap() failed"); + std::fill(ptr0, ptr0 + numElements, 1); + error_ = clEnqueueSVMUnmap(queue, ptr0, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueSVMUnmap() failed"); + + // we copy the 1st buffer into the 2nd buffer + error_ = clEnqueueSVMMemcpy(queue, true, ptr1, ptr0, size, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueSVMMemcpy() failed"); + + // verification: the 2nd buffer should be identical to the 1st + error_ = clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, ptr1, size, 0, NULL, + &userEvent); + CHECK_ERROR(error_, "clEnqueueSVMMap() failed"); + + error_ = clWaitForEvents(1, &userEvent); + CHECK_ERROR(error_, "clWaitForEvents() failed"); + + size_t observed = std::count(ptr1, ptr1 + numElements, 1); + size_t expected = numElements; + CHECK_RESULT(observed != expected, "Expected: %zd, found:%zd", expected, + observed); + + void* ptrs[2] = {ptr0, ptr1}; + error_ = + clEnqueueSVMFree(queue, countOf(ptrs), ptrs, NULL, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueSVMFree() failed"); + error_ = clFinish(queue); + CHECK_ERROR(error_, "clFinish() failed"); +} + +/** + * Simple test to ensure that SVM pointer arguments are identified properly in + * the runtime, since kernel arguments of pointer type can be bound to either + * SVM pointers or cl_mem objects. 
+ */ +void OCLSVM::runSvmArgumentsAreRecognized() { + cl_int8 arg0; + error_ = clSetKernelArg(kernel_, 0, sizeof(arg0), &arg0); + CHECK_ERROR(error_, "clSetKernelArg() failed"); + + error_ = clSetKernelArgSVMPointer(kernel_, 1, NULL); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + cl_int arg2; + error_ = clSetKernelArg(kernel_, 2, sizeof(arg2), &arg2); + CHECK_ERROR(error_, "clSetKernelArg() failed"); + + error_ = clSetKernelArgSVMPointer(kernel_, 3, NULL); + CHECK_ERROR(error_, "clSetKernelArgSVMPointer() failed"); + + cl_mem arg4 = NULL; + error_ = clSetKernelArg(kernel_, 4, sizeof(arg4), &arg4); + CHECK_ERROR(error_, "clSetKernelArg() failed"); + + size_t gws[1] = {1}; + + // run dummy kernel + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "Queue::finish() failed"); + + // now we bind a pointer argument to a standard buffer instead of a SVM one + cl_mem buffer = NULL; + error_ = clSetKernelArg(kernel_, 1, sizeof(buffer), &buffer); + CHECK_ERROR(error_, "clSetKernelArg() failed"); + + // re-execute the dummy kernel using different actual parameters + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, gws, NULL, 0, NULL, NULL); + CHECK_ERROR(error_, "clEnqueueNDRangeKernel() failed"); + error_ = _wrapper->clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "Queue::finish() failed"); +} + +void OCLSVM::runSvmCommandsExecutedInOrder() { + const int numElements = 100000; + size_t size = numElements * sizeof(int); + // allocate SVM memory + int* data = (int*)clSVMAlloc(context_, CL_MEM_READ_WRITE, size, 0); + CHECK_RESULT(!data, "clSVMAlloc failed"); + + // map the SVM buffer to host + cl_int status = clEnqueueSVMMap(cmdQueues_[_deviceId], CL_TRUE, CL_MAP_WRITE, + data, size, 0, NULL, NULL); + CHECK_ERROR(status, 
"Error when mapping SVM buffer"); + + // fill buffer with 0s + std::fill(data, data + numElements, 0); + + // unmap the SVM buffer to host + status = clEnqueueSVMUnmap(cmdQueues_[_deviceId], data, 0, NULL, NULL); + CHECK_ERROR(status, "Error when unmapping SVM buffer"); + + // enqueue kernel + status = clSetKernelArgSVMPointer(kernel_, 0, data); + CHECK_ERROR(status, "Error when setting kernel argument"); + status = clSetKernelArg(kernel_, 1, sizeof(int), &numElements); + CHECK_ERROR(status, "clSetKernelArg() failed"); + + cl_event event; + size_t overallSize = (size_t)numElements; + status = clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, NULL, + &overallSize, NULL, 0, NULL, &event); + CHECK_ERROR(status, "Error when enqueuing kernel"); + error_ = clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(status, "clFinish()"); + + // map the SVM buffer to host + status = clEnqueueSVMMap(cmdQueues_[_deviceId], CL_TRUE, CL_MAP_READ, data, + size, 0, NULL, NULL); + CHECK_ERROR(status, "Error when mapping SVM buffer"); + + bool pass = true; + // verify the data. 
Using descending order might increase the chance of + // finding an error since the GPU (when used) might not have finished + // updating the data array by the time we do the verification + for (int i = numElements - 1; i >= 0; i--) { + if (data[i] != numElements + 1) { + pass = false; + break; + } + } + + // unmap the SVM buffer to host + status = clEnqueueSVMUnmap(cmdQueues_[_deviceId], data, 0, NULL, NULL); + CHECK_ERROR(status, "Error when unmapping SVM buffer"); + + // free the SVM buffer + status = clEnqueueSVMFree(cmdQueues_[_deviceId], 1, (void**)&data, NULL, NULL, + 0, NULL, NULL); + CHECK_ERROR(status, "Error when freeing the SVM buffer"); + error_ = clFinish(cmdQueues_[_deviceId]); + CHECK_ERROR(error_, "clFinish() failed"); + CHECK_RESULT(!pass, "Wrong result"); +} + +void OCLSVM::runIdentifySvmBuffers() { + size_t size = 1024 * 1024; + + // dummy allocation to force the runtime to track several SVM buffers + clSVMAlloc(context_, CL_MEM_READ_WRITE, size * 10, 0); + + void* ptr = clSVMAlloc(context_, CL_MEM_READ_WRITE, size, 0); + cl_int status; + cl_bool usesSVMpointer = CL_FALSE; + + // dummy allocation to force the runtime to track several SVM buffers + clSVMAlloc(context_, CL_MEM_READ_WRITE, size * 4, 0); + + // buffer using the entire SVM region should be identified as such + cl_mem buf1 = + clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, size, ptr, &status); + CHECK_ERROR(status, "clCreateBuffer failed."); + + size_t paramSize = 0; + status = clGetMemObjectInfo(buf1, CL_MEM_USES_SVM_POINTER, 0, 0, ¶mSize); + CHECK_ERROR(status, "clGetMemObjectInfo failed"); + CHECK_RESULT(paramSize != sizeof(cl_bool), + "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) " + "returned wrong size."); + + status = clGetMemObjectInfo(buf1, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), + &usesSVMpointer, 0); + CHECK_ERROR(status, "clGetMemObjectInfo failed"); + CHECK_RESULT(usesSVMpointer != CL_TRUE, + "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) " + "returned CL_FALSE for buffer 
created from SVM pointer."); + + // Buffer that uses random region within SVM buffers + cl_mem buf2 = clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, 256, + (char*)ptr + size - 256, &status); + CHECK_ERROR(status, "clCreateBuffer failed."); + + status = clGetMemObjectInfo(buf2, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), + &usesSVMpointer, 0); + CHECK_ERROR(status, "clGetMemObjectInfo failed"); + CHECK_RESULT(usesSVMpointer != CL_TRUE, + "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) " + "returned CL_FALSE for buffer created from SVM pointer."); + + // for any other pointer the query should return false + void* randomPtr = malloc(size); + cl_mem buf3 = + clCreateBuffer(context_, CL_MEM_USE_HOST_PTR, size, randomPtr, &status); + CHECK_ERROR(status, "clCreateBuffer failed."); + + status = clGetMemObjectInfo(buf3, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), + &usesSVMpointer, 0); + CHECK_ERROR(status, "clGetMemObjectInfo failed"); + CHECK_RESULT(usesSVMpointer == CL_TRUE, + "clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) " + "returned CL_TRUE for buffer not created from SVM pointer."); + + clReleaseMemObject(buf3); + clReleaseMemObject(buf2); + clReleaseMemObject(buf1); + clSVMFree(context_, ptr); +} +#endif + +cl_bool OCLSVM::isOpenClSvmAvailable(cl_device_id device_id) { +#ifdef CL_VERSION_2_0 + error_ = clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_SVM_CAPABILITIES, + sizeof(svmCaps_), &svmCaps_, NULL); + CHECK_ERROR_NO_RETURN(error_, "clGetDeviceInfo() failed"); + if (!(svmCaps_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) { + return CL_FALSE; + } else { + return CL_TRUE; + } +#endif + // -Device does not support OpenCL >= 2.0 + // -Device supports OpenCL >= 2.0, but available headers are <= 1.2 + return CL_FALSE; +} + +void OCLSVM::run() { + if (!isOpenClSvmAvailable(devices_[_deviceId])) { + printf("Device does not support any SVM features, skipping...\n"); + return; + } + + if (_openTest == 0) { + runFineGrainedBuffer(); + } else if (_openTest == 1) { + runFineGrainedSystem(); 
+ } else if (_openTest == 2) { + runFineGrainedSystemLargeAllocations(); + } else if (_openTest == 3) { + runLinkedListSearchUsingFineGrainedSystem(); + } else if (_openTest == 4) { + runPlatformAtomics(); + } else if (_openTest == 5) { + runEnqueueOperations(); + } else if (_openTest == 6) { + runSvmArgumentsAreRecognized(); + } else if (_openTest == 7) { + runSvmCommandsExecutedInOrder(); + } else if (_openTest == 8) { + runIdentifySvmBuffers(); + } +} + +unsigned int OCLSVM::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h new file mode 100644 index 0000000000..f861081fed --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSVM.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_SVM_H_ +#define _OCL_SVM_H_ + +#include + +#include "OCLTestImp.h" +#include "stdint.h" + +class OCLSVM : public OCLTestImp { + public: + OCLSVM(); + + virtual ~OCLSVM(); + + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + + virtual void run(void); + + virtual unsigned int close(void); + + private: + void runFineGrainedBuffer(); + void runFineGrainedSystem(); + void runFineGrainedSystemLargeAllocations(); + void runLinkedListSearchUsingFineGrainedSystem(); + void runPlatformAtomics(); + void runEnqueueOperations(); + void runSvmArgumentsAreRecognized(); + void runSvmCommandsExecutedInOrder(); + void runIdentifySvmBuffers(); + cl_bool isOpenClSvmAvailable(cl_device_id device_id); + + uint64_t svmCaps_; +}; + +struct Node { + Node(uint64_t value, Node* next) : value_(value), next_((uint64_t)next) {} + + uint64_t value_; + uint64_t next_; +}; + +#endif // _OCL_SVM_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp new file mode 100644 index 0000000000..9804633196 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.cpp @@ -0,0 +1,225 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "OCLSemaphore.h" + +#include +#include +#include + +#include "CL/cl.h" +#ifndef CL_DEVICE_MAX_SEMAPHORES_AMD +#define CL_DEVICE_MAX_SEMAPHORES_AMD 0x1041 +#else +#error "CL_DEVICE_MAX_SEMAPHORES_AMD is defined somewhere, remove this define!" +#endif +#ifndef CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD +#define CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD 0x1042 +#else +#error \ + "CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD is defined somewhere, remove this define!" +#endif +#ifndef CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD +#define CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD 0x1043 +#else +#error \ + "CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD is defined somewhere, remove this define!" 
+#endif + +const static unsigned int MaxSemaphores = 1; + +const static char* strKernel = + "#ifdef cl_amd_semaphore\n" + "#pragma OPENCL EXTENSION cl_amd_semaphore : enable \n" + "kernel void sema_test(sema_t lock, global int* a, global int* b, int " + "value)\n" + " {\n" + " size_t idx = get_global_id(0);\n" + " size_t gdx = get_group_id(0);\n" + " size_t ng = get_num_groups(0);\n" + " size_t ssize = get_max_semaphore_size();\n" + " a[1] = true;\n" + " if (gdx >= ssize) {\n" + " return;\n" + " }\n" + " barrier(CLK_GLOBAL_MEM_FENCE);\n" + " semaphore_init(lock, ng);\n" + " while (a[1]) {\n" + " atom_add(a, b[idx]);\n" + " atom_inc(a + 2);\n" + " if (gdx == (ssize - 1)) {\n" + " semaphore_signal(lock);\n" + " if (a[0] >= value) {\n" + " a[1] = false;\n" + " }\n" + " } else {\n" + " semaphore_wait(lock);\n" + " idx += get_global_size(0);\n" + " }\n" + " }\n" + " semaphore_signal(lock);\n" + " }\n" + "#endif\n"; + +OCLSemaphore::OCLSemaphore() { + _numSubTests = 1; + hasSemaphore = false; +} + +OCLSemaphore::~OCLSemaphore() {} + +void OCLSemaphore::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + char name[1024] = {0}; + size_t size = 0; + _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024, + name, &size); + if (!strstr(name, "cl_amd_semaphore")) { + error_ = CL_DEVICE_NOT_FOUND; + hasSemaphore = false; + printf("Semaphore extension is required for this test!\n"); + return; + } else { + hasSemaphore = true; + } + _wrapper->clGetDeviceInfo(devices_[deviceId], + (cl_device_info)CL_DEVICE_MAX_SEMAPHORES_AMD, + sizeof(size), &size, NULL); + _wrapper->clGetDeviceInfo(devices_[deviceId], + (cl_device_info)CL_DEVICE_MAX_SEMAPHORE_SIZE_AMD, + sizeof(size), &size, NULL); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), 
"clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = _wrapper->clCreateKernel(program_, "sema_test", &error_); + _wrapper->clGetKernelInfo(kernel_, + (cl_kernel_info)CL_KERNEL_MAX_SEMAPHORE_SIZE_AMD, + sizeof(size), &size, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + for (unsigned int i = 0; i < MaxSemaphores; ++i) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } + + buffer = + _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + 1024 * size * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + buffer = + _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, + 1024 * size * sizeof(cl_uint), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLSemaphore::run(void) { + if (!hasSemaphore) { + return; + } + cl_uint initVal[2] = {5, 10}; + + for (unsigned int i = 0; i < MaxSemaphores; ++i) { + cl_mem buffer = buffers()[i]; + error_ = _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_uint), &initVal[i]); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + } + + cl_mem buffer = buffers()[MaxSemaphores]; + error_ = + _wrapper->clSetKernelArg(kernel_, MaxSemaphores, sizeof(cl_mem), &buffer); + 
CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + buffer = buffers()[MaxSemaphores + 1]; + error_ = _wrapper->clSetKernelArg(kernel_, MaxSemaphores + 1, sizeof(cl_mem), + &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + cl_int val = 64; + error_ = + _wrapper->clSetKernelArg(kernel_, MaxSemaphores + 2, sizeof(val), &val); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + + size_t gws[1] = {64}; + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[0], kernel_, 1, NULL, + gws, NULL, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + + cl_uint outputV[MaxSemaphores] = {0}; + + // Find the new counter value + initVal[0]++; + initVal[1]--; + + for (unsigned int i = 0; i < MaxSemaphores; ++i) { + cl_mem buffer = buffers()[i]; + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[0], buffers()[i], true, 0, + sizeof(cl_uint), &outputV[i], 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + if (initVal[i] != outputV[i]) { + printf("%u != %u", initVal[i], outputV[i]); + CHECK_RESULT(true, " - Incorrect result for counter!\n"); + } + } + + // Restore the original value to check the returned result in the kernel + initVal[0]--; + initVal[1]++; + + buffer = buffers()[MaxSemaphores]; + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[0], buffers()[MaxSemaphores], true, 0, + MaxSemaphores * sizeof(cl_uint), outputV, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + for (unsigned int i = 0; i < MaxSemaphores; ++i) { + if (initVal[i] != outputV[i]) { + printf("%u != %u", initVal[i], outputV[i]); + CHECK_RESULT(true, + " - Incorrect result for counter inside kernel. 
Returned " + "value != original.\n"); + } + } +} + +unsigned int OCLSemaphore::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h new file mode 100644 index 0000000000..9d7aa54dd7 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLSemaphore.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef _OCL_SEMAPHORE_H_ +#define _OCL_SEMAPHORE_H_ + +#include "OCLTestImp.h" + +class OCLSemaphore : public OCLTestImp { + public: + OCLSemaphore(); + virtual ~OCLSemaphore(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + bool hasSemaphore; +}; + +#endif // _OCL_SEMAPHORE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp new file mode 100644 index 0000000000..98709241d8 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.cpp @@ -0,0 +1,129 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLStablePState.h" + +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +cl_device_id gpu_device; + +OCLStablePState::OCLStablePState() { + _numSubTests = 1; + failed_ = false; +} + +OCLStablePState::~OCLStablePState() {} + +void OCLStablePState::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + cl_uint numPlatforms; + cl_platform_id platform = NULL; + cl_uint num_devices = 0; + cl_device_id* devices = NULL; + cl_device_id device = NULL; + _deviceId = deviceId; + + if (type_ != CL_DEVICE_TYPE_GPU) { + error_ = CL_DEVICE_NOT_FOUND; + printf("GPU device is required for this test!\n"); + return; + } + + error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); + if (0 < numPlatforms) { + cl_platform_id* platforms = new cl_platform_id[numPlatforms]; + error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); +#if 0 + // Get last for default + platform = platforms[numPlatforms - 1]; + for (unsigned i = 0; i < numPlatforms; ++i) { +#endif + platform = platforms[_platformIndex]; + char pbuf[100]; + error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], + CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, + NULL); + num_devices = 0; + /* Get the number of requested devices */ + error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, + &num_devices); +#if 0 + } +#endif + delete platforms; + } + /* + * If we could find our platform, use it. If not, die as we need the AMD + * platform for these extensions. 
+ */ + CHECK_RESULT(platform == 0, + "Couldn't find platform with GPU devices, cannot proceed"); + + devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id)); + CHECK_RESULT(devices == 0, "no devices"); + + /* Get the requested device */ + error_ = + _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); + + CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); + device = devices[_deviceId]; + gpu_device = device; +} + +static void CL_CALLBACK notify_callback(cl_event event, + cl_int event_command_exec_status, + void* user_data) {} + +void OCLStablePState::run(void) { + if (failed_) { + return; + } + cl_set_device_clock_mode_input_amd setClockModeInput; + setClockModeInput.clock_mode = CL_DEVICE_CLOCK_MODE_PROFILING_AMD; + cl_set_device_clock_mode_output_amd setClockModeOutput = {}; + error_ = _wrapper->clSetDeviceClockModeAMD(gpu_device, setClockModeInput, + &setClockModeOutput); +#ifdef ATI_OS_WIN + CHECK_RESULT(error_ != CL_SUCCESS, "SetClockMode profiling failed\n"); +#else + error_ = CL_SUCCESS; +#endif + + setClockModeInput.clock_mode = CL_DEVICE_CLOCK_MODE_DEFAULT_AMD; + setClockModeOutput = {}; + error_ = _wrapper->clSetDeviceClockModeAMD(gpu_device, setClockModeInput, + &setClockModeOutput); +#ifdef ATI_OS_WIN + CHECK_RESULT(error_ != CL_SUCCESS, "SetClockMode default failed\n"); +#else + error_ = CL_SUCCESS; +#endif +} + +unsigned int OCLStablePState::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h new file mode 100644 index 0000000000..ec2e6750ff --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLStablePState.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef _OCL_STABLE_PSTATE_H_ +#define _OCL_STABLE_PSTATE_H_ + +#include "OCLTestImp.h" + +class OCLStablePState : public OCLTestImp { + public: + OCLStablePState(); + virtual ~OCLStablePState(); + + public: + virtual void open(unsigned int test, char* units, double& conversion, + unsigned int deviceID); + virtual void run(void); + virtual unsigned int close(void); + + private: + bool failed_; +}; + +#endif // _OCL_STABLE_PSTATE_H_ diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp new file mode 100644 index 0000000000..785e27c874 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.cpp @@ -0,0 +1,344 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLThreadTrace.h" + +#include +#include +#include + +#include "CL/cl.h" + +const static unsigned int IOThreadTrace = 3; // number of input/oputput buffers +static size_t SeNum = 1; // number of SEs +const static unsigned int ttBufSize = 30000; // size of thread trace buffer +const static unsigned int InputElements = 2048; // elements in each vector + +const static char* strKernel = + "__kernel void thread_trace_test( \n" + " __global int *A,__global int *B,__global int *C) \n" + "{ \n" + " int idx = get_global_id(0); \n" + " C[idx] = A[idx] + B[idx]; \n" + "} \n"; + +OCLThreadTrace::OCLThreadTrace() { + _numSubTests = 1; + failed_ = false; + clCreateThreadTraceAMD_ = 0; + clReleaseThreadTraceAMD_ = 0; + clRetainThreadTraceAMD_ = 0; + clGetThreadTraceInfoAMD_ = 0; + clSetThreadTraceParamAMD_ = 0; + clEnqueueThreadTraceCommandAMD_ = 0; + clEnqueueBindThreadTraceBufferAMD_ = 0; + ioBuf_ = 0; + ttBuf_ = 0; +} + +OCLThreadTrace::~OCLThreadTrace() {} + +void OCLThreadTrace::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening"); + + if (deviceId >= deviceCount_) { + failed_ = true; + return; + } + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + + size_t threadTraceEnabled; + size_t retsize; + error_ = _wrapper->clGetDeviceInfo( + devices_[deviceId], CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD, + sizeof(threadTraceEnabled), &threadTraceEnabled, &retsize); + CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); + + if (!threadTraceEnabled) { + failed_ = true; + testDescString = "Not supported"; + return; + } + + unsigned int 
datasize = sizeof(unsigned int) * InputElements; + + ioBuf_ = (unsigned int**)malloc(IOThreadTrace * sizeof(unsigned int*)); + CHECK_RESULT((ioBuf_ == NULL), "malloc failed"); + + memset(ioBuf_, 0, IOThreadTrace * sizeof(unsigned int*)); + for (unsigned i = 0; i < IOThreadTrace; ++i) { + ioBuf_[i] = (unsigned int*)malloc(datasize); + CHECK_RESULT((ioBuf_[i] == NULL), "malloc failed"); + for (unsigned j = 0; j < InputElements; ++j) { + ioBuf_[i][j] = j; + } + } + + clCreateThreadTraceAMD_ = + (fnp_clCreateThreadTraceAMD)_wrapper->clGetExtensionFunctionAddress( + "clCreateThreadTraceAMD"); + CHECK_RESULT((clCreateThreadTraceAMD_ == 0), + "clGetExtensionFunctionAddress(clCreateThreadTraceAMD) failed"); + clGetThreadTraceInfoAMD_ = + (fnp_clGetThreadTraceInfoAMD)_wrapper->clGetExtensionFunctionAddress( + "clGetThreadTraceInfoAMD"); + CHECK_RESULT((clGetThreadTraceInfoAMD_ == 0), + "clGetExtensionFunctionAddress(clGetThreadTraceInfoAMD) failed"); + + threadTrace_ = clCreateThreadTraceAMD_(devices_[_deviceId], &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateThreadTraceAMD() failed"); + + // Get number of shader engines + clGetThreadTraceInfoAMD_(threadTrace_, CL_THREAD_TRACE_SE, sizeof(SeNum), + &SeNum, NULL); + + ttBuf_ = (unsigned int**)malloc(SeNum * sizeof(unsigned int*)); + CHECK_RESULT((ttBuf_ == NULL), "malloc failed"); + + memset(ttBuf_, 0, SeNum * sizeof(unsigned int*)); + + program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, + &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed"); + + error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, + NULL, NULL); + if (error_ != CL_SUCCESS) { + char programLog[1024]; + _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], + CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); + printf("\n%s\n", programLog); + fflush(stdout); + } + CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed"); + + kernel_ = 
_wrapper->clCreateKernel(program_, "thread_trace_test", &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed"); + + cl_mem buffer; + for (unsigned int i = 0; i < IOThreadTrace; ++i) { + buffer = _wrapper->clCreateBuffer(context_, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + datasize, ioBuf_[i], &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } + + for (unsigned int i = 0; i < SeNum; ++i) { + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, ttBufSize, + NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + } + + clReleaseThreadTraceAMD_ = + (fnp_clReleaseThreadTraceAMD)_wrapper->clGetExtensionFunctionAddress( + "clReleaseThreadTraceAMD"); + CHECK_RESULT((clReleaseThreadTraceAMD_ == 0), + "clGetExtensionFunctionAddress(clReleaseThreadTraceAMD) failed"); + clRetainThreadTraceAMD_ = + (fnp_clRetainThreadTraceAMD)_wrapper->clGetExtensionFunctionAddress( + "clRetainThreadTraceAMD"); + CHECK_RESULT((clRetainThreadTraceAMD_ == 0), + "clGetExtensionFunctionAddress(clRetainThreadTraceAMD) failed"); + clSetThreadTraceParamAMD_ = + (fnp_clSetThreadTraceParamAMD)_wrapper->clGetExtensionFunctionAddress( + "clSetThreadTraceParamAMD"); + CHECK_RESULT( + (clSetThreadTraceParamAMD_ == 0), + "clGetExtensionFunctionAddress(clSetThreadTraceParamAMD) failed"); + clEnqueueThreadTraceCommandAMD_ = (fnp_clEnqueueThreadTraceCommandAMD) + _wrapper->clGetExtensionFunctionAddress( + "clEnqueueThreadTraceCommandAMD"); + CHECK_RESULT( + (clEnqueueThreadTraceCommandAMD_ == 0), + "clGetExtensionFunctionAddress(clEnqueueThreadTraceCommandAMD) failed"); + clEnqueueBindThreadTraceBufferAMD_ = + (fnp_clEnqueueBindThreadTraceBufferAMD)_wrapper + ->clGetExtensionFunctionAddress("clEnqueueBindThreadTraceBufferAMD"); + CHECK_RESULT((clEnqueueBindThreadTraceBufferAMD_ == 0), + "clGetExtensionFunctionAddress(" + "clEnqueueBindThreadTraceBufferAMD) 
failed"); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +static void DumpTraceSI(unsigned int index, cl_ushort* tracePtr, + size_t numOfBytes) { + FILE* outFile; + char file_name[16] = {0}; + static unsigned int iii = 0; + sprintf(file_name, "TTrace%d%d.out", index, iii++); + + outFile = fopen(file_name, "w"); + + for (size_t i = 0; i < numOfBytes / 2; i++) { + fprintf(outFile, "%04x\n", (cl_ushort)(*tracePtr)); + tracePtr++; + } + + fclose(outFile); +} + +#define DUMPTRACE 0 + +void OCLThreadTrace::run(void) { + cl_mem* ttArrBuf = 0; + unsigned int* ttBufRecordedSizes = 0; + unsigned int i = 0, j = 0; + + if (failed_) { + return; + } + + for (i = 0; i < IOThreadTrace; ++i) { + cl_mem buffer = buffers()[i]; + error_ = _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffer); + CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed"); + } + + size_t globalWorkSize[1]; + size_t localWorkSize[1]; + globalWorkSize[0] = InputElements; + localWorkSize[0] = 32; + + ttArrBuf = (cl_mem*)malloc(sizeof(cl_mem) * SeNum); + ; + for (i = 0; i < SeNum; i++) ttArrBuf[i] = buffers()[IOThreadTrace + i]; + + cl_event clEvent; + error_ = clEnqueueBindThreadTraceBufferAMD_( + cmdQueues_[_deviceId], threadTrace_, ttArrBuf, (cl_uint)SeNum, ttBufSize, + 0, NULL, &clEvent); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueBindThreadTraceBufferAMD() failed"); + + error_ = clEnqueueThreadTraceCommandAMD_(cmdQueues_[_deviceId], threadTrace_, + CL_THREAD_TRACE_BEGIN_COMMAND, 0, + NULL, &clEvent); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueThreadTraceCommandAMD() failed"); + + error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1, + NULL, globalWorkSize, localWorkSize, + 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed"); + clFinish(cmdQueues_[_deviceId]); + + error_ = clEnqueueThreadTraceCommandAMD_(cmdQueues_[_deviceId], 
threadTrace_, + CL_THREAD_TRACE_END_COMMAND, 0, NULL, + &clEvent); + CHECK_RESULT((error_ != CL_SUCCESS), + "clEnqueueThreadTraceCommandAMD() failed"); + + ttBufRecordedSizes = (unsigned int*)malloc(sizeof(unsigned int) * SeNum); + memset(ttBufRecordedSizes, 0, sizeof(unsigned int) * SeNum); + size_t ttBufRecordedSize; + error_ = clGetThreadTraceInfoAMD_(threadTrace_, CL_THREAD_TRACE_BUFFERS_SIZE, + 1, NULL, &ttBufRecordedSize); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetThreadTraceInfoAMD() failed"); + + if (ttBufRecordedSize > sizeof(unsigned int) * SeNum) { + free(ttBufRecordedSizes); + ttBufRecordedSizes = (unsigned int*)malloc(ttBufRecordedSize); + memset(ttBufRecordedSizes, 0, ttBufRecordedSize); + } + + error_ = + clGetThreadTraceInfoAMD_(threadTrace_, CL_THREAD_TRACE_BUFFERS_SIZE, + ttBufRecordedSize, ttBufRecordedSizes, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clGetThreadTraceInfoAMD() failed"); + + for (i = 0; i < SeNum; ++i) { + ttBuf_[i] = (cl_uint*)malloc(ttBufRecordedSizes[i] * sizeof(cl_uint)); + CHECK_RESULT((ttBuf_[i] == NULL), "malloc failed"); + } + + for (i = 0; i < SeNum; ++i) { + if (ttBufRecordedSizes[i] != 0) { + error_ = _wrapper->clEnqueueReadBuffer( + cmdQueues_[_deviceId], buffers()[IOThreadTrace + i], CL_TRUE, 0, + ttBufRecordedSizes[i], ttBuf_[i], 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); +#if DUMPTRACE + DumpTraceSI(i, (cl_ushort*)ttBuf_[i], ttBufRecordedSizes[i]); +#endif + } + } + + bool validRes = true; + for (i = 0; i < SeNum; ++i) { + unsigned j; + for (j = 0; j < ttBufRecordedSizes[i]; ++j) { + if (ttBuf_[i][j] != 0) { + break; + } + } + if (j >= ttBufRecordedSizes[i] && ttBufRecordedSizes[i] > 0) { + validRes = false; + break; + } + } + if (!validRes) { + CHECK_RESULT( + true, + " - Incorrect result for thread trace. 
no output data was recorded.\n"); + } + + if (ttArrBuf) free(ttArrBuf); + if (ttBufRecordedSizes) free(ttBufRecordedSizes); +} + +unsigned int OCLThreadTrace::close(void) { + if (clReleaseThreadTraceAMD_ && threadTrace_) + clReleaseThreadTraceAMD_(threadTrace_); + + if (ioBuf_) { + for (unsigned i = 0; i < IOThreadTrace; ++i) { + if (ioBuf_[i]) { + free(ioBuf_[i]); + } + } + free(ioBuf_); + } + if (ttBuf_) { + for (unsigned i = 0; i < SeNum; ++i) { + if (ttBuf_[i]) { + free(ttBuf_[i]); + } + } + free(ttBuf_); + } + return OCLTestImp::close(); +} diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h new file mode 100644 index 0000000000..6995b499b6 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLThreadTrace.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/
+
+#ifndef _OCL_THREAD_TRACE_H_
+#define _OCL_THREAD_TRACE_H_
+
+#include "OCLTestImp.h"
+#include "cl_thread_trace_amd.h"
+
+// Thread Trace API
+//
+// Function-pointer types, one per AMD thread-trace entry point. The class
+// below keeps one pointer of each type and calls the API through them.
+typedef CL_API_ENTRY cl_threadtrace_amd(
+    CL_API_CALL *fnp_clCreateThreadTraceAMD)(cl_device_id, cl_int *);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clReleaseThreadTraceAMD)(
+    cl_threadtrace_amd);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clRetainThreadTraceAMD)(
+    cl_threadtrace_amd);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clGetThreadTraceInfoAMD)(
+    cl_threadtrace_amd, cl_threadtrace_info, size_t, void *, size_t *);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clSetThreadTraceParamAMD)(
+    cl_threadtrace_amd, cl_thread_trace_param, cl_uint);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clEnqueueThreadTraceCommandAMD)(
+    cl_command_queue, cl_threadtrace_amd, cl_threadtrace_command_name_amd,
+    cl_uint, const cl_event *, cl_event *);
+typedef CL_API_ENTRY cl_int(CL_API_CALL *fnp_clEnqueueBindThreadTraceBufferAMD)(
+    cl_command_queue, cl_threadtrace_amd, cl_mem *, cl_uint, cl_uint, cl_uint,
+    const cl_event *, cl_event *);
+
+// Test that records a thread trace around a kernel launch and checks that the
+// trace buffers received data.
+class OCLThreadTrace : public OCLTestImp {
+ public:
+  OCLThreadTrace();
+  virtual ~OCLThreadTrace();
+
+ public:
+  // Prepares the test for the given sub-test and device.
+  // NOTE(review): the definition is not in view here; presumably it resolves
+  // the function pointers below and creates the buffers - confirm.
+  virtual void open(unsigned int test, char *units, double &conversion,
+                    unsigned int deviceID);
+  // Runs the trace and validates the recorded data; returns immediately when
+  // failed_ is set.
+  virtual void run(void);
+  // Releases the thread-trace object and the host buffers, then calls
+  // OCLTestImp::close() and returns its result.
+  virtual unsigned int close(void);
+
+ private:
+  // When true, run() does nothing.
+  bool failed_;
+  // Array of host buffers; close() frees IOThreadTrace entries and the array.
+  cl_uint **ioBuf_;
+  // Array of host buffers that run() allocates and fills with the recorded
+  // trace data; close() frees SeNum entries and the array.
+  cl_uint **ttBuf_;
+  // Thread-trace object, released in close() via clReleaseThreadTraceAMD_.
+  cl_threadtrace_amd threadTrace_;
+
+  // Entry points of the thread-trace API, called through these pointers.
+  fnp_clCreateThreadTraceAMD clCreateThreadTraceAMD_;
+  fnp_clReleaseThreadTraceAMD clReleaseThreadTraceAMD_;
+  fnp_clRetainThreadTraceAMD clRetainThreadTraceAMD_;
+  fnp_clGetThreadTraceInfoAMD clGetThreadTraceInfoAMD_;
+  fnp_clSetThreadTraceParamAMD clSetThreadTraceParamAMD_;
+  fnp_clEnqueueThreadTraceCommandAMD clEnqueueThreadTraceCommandAMD_;
+  fnp_clEnqueueBindThreadTraceBufferAMD clEnqueueBindThreadTraceBufferAMD_;
+};
+
+#endif  // _OCL_THREAD_TRACE_H_
diff --git
a/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp new file mode 100644 index 0000000000..fc7298e087 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.cpp @@ -0,0 +1,127 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLUnalignedCopy.h" + +#include +#include +#include +#include + +#include "CL/cl.h" +#include "CL/cl_ext.h" + +static const int BufSize = 64; + +OCLUnalignedCopy::OCLUnalignedCopy() { + _numSubTests = 1; + failed_ = false; +} + +OCLUnalignedCopy::~OCLUnalignedCopy() {} + +void OCLUnalignedCopy::open(unsigned int test, char* units, double& conversion, + unsigned int deviceId) { + _deviceId = deviceId; + OCLTestImp::open(test, units, conversion, deviceId); + CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test"); + + cl_device_type deviceType; + error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE, + sizeof(deviceType), &deviceType, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed"); + + if (!(deviceType & CL_DEVICE_TYPE_GPU)) { + printf("GPU device is required for this test!\n"); + failed_ = true; + return; + } + cl_mem buffer; + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, + BufSize * sizeof(cl_int4), NULL, &error_); + CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); + buffers_.push_back(buffer); + + buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, + BufSize * sizeof(cl_int4), NULL, &error_); + buffers_.push_back(buffer); +} + +static void CL_CALLBACK notify_callback(const char* errinfo, + const void* private_info, size_t cb, + void* user_data) {} + +void OCLUnalignedCopy::run(void) { + if (failed_) { + return; + } + + char* values = new char[BufSize]; + char* results = new char[BufSize]; + + for (int i = 0; i < BufSize; ++i) { + values[i] = i; + } + + static const char TestCnt = 7; + char sizes[TestCnt][3] = { + {5, 7, 13}, {5, 7, 12}, {4, 9, 12}, {4, 9, 15}, + {27, 16, 15}, {27, 16, 13}, {32, 16, 13}, + }; + + for (int i = 0; i < TestCnt; ++i) { + error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues_[_deviceId], buffers_[0], + CL_FALSE, 0, BufSize, values, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed"); + + cl_uint 
pattern = 0; + error_ = /*_wrapper->*/ clEnqueueFillBuffer( + cmdQueues_[_deviceId], buffers_[1], &pattern, sizeof(pattern), 0, + BufSize, 0, NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueFillBuffer() failed"); + + error_ = _wrapper->clEnqueueCopyBuffer( + cmdQueues_[_deviceId], buffers_[0], buffers_[1], sizes[i][0], + sizes[i][1], sizes[i][2], 0, NULL, NULL); + CHECK_RESULT(error_, "clEnqueueCopyBuffer failed"); + + error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], buffers_[1], + CL_TRUE, 0, BufSize, results, 0, + NULL, NULL); + CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); + + for (int j = 0; j < sizes[i][1]; ++j) { + CHECK_RESULT(results[j] != 0, "Comparison failed"); + } + for (int j = sizes[i][1], k = 0; j < (sizes[i][1] + sizes[i][2]); + ++j, ++k) { + CHECK_RESULT(results[j] != sizes[i][0] + k, "Comparison failed"); + } + for (int j = (sizes[i][1] + sizes[i][2]); j < BufSize; ++j) { + CHECK_RESULT(results[j] != 0, "Comparison failed"); + } + } + + delete[] values; + delete[] results; +} + +unsigned int OCLUnalignedCopy::close(void) { return OCLTestImp::close(); } diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h new file mode 100644 index 0000000000..18c764af86 --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLUnalignedCopy.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2010-present Advanced Micro Devices, Inc. 
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef _OCL_UNALIGNED_COPY_H_
+#define _OCL_UNALIGNED_COPY_H_
+
+#include "OCLTestImp.h"
+
+// Test that copies byte ranges with unaligned offsets and sizes between two
+// device buffers and verifies the destination contents.
+class OCLUnalignedCopy : public OCLTestImp {
+ public:
+  OCLUnalignedCopy();
+  virtual ~OCLUnalignedCopy();
+
+ public:
+  // Opens the base test, checks that the device is a GPU and creates the
+  // source and destination buffers; sets failed_ on a non-GPU device.
+  virtual void open(unsigned int test, char* units, double& conversion,
+                    unsigned int deviceID);
+  // Runs the copy/verify cases; returns immediately when failed_ is set.
+  virtual void run(void);
+  // Returns the result of OCLTestImp::close().
+  virtual unsigned int close(void);
+
+ private:
+  // Set by open() when the device is not a GPU; makes run() a no-op.
+  bool failed_;
+};
+
+#endif  // _OCL_UNALIGNED_COPY_H_
diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp
new file mode 100644
index 0000000000..4d03b22ee8
--- /dev/null
+++ b/projects/clr/opencl/tests/ocltst/module/runtime/TestList.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "OCLTestListImp.h" + +// +// Includes for tests +// +#include "OCLAsyncMap.h" +#include "OCLAsyncTransfer.h" +#include "OCLAtomicCounter.h" +#include "OCLBlitKernel.h" +#include "OCLBufferFromImage.h" +#include "OCLCPUGuardPages.h" +#include "OCLCreateBuffer.h" +#include "OCLCreateContext.h" +#include "OCLCreateImage.h" +#include "OCLDeviceAtomic.h" +#include "OCLDeviceQueries.h" +#include "OCLDynamic.h" +#include "OCLDynamicBLines.h" +#include "OCLGenericAddressSpace.h" +#include "OCLGetQueueThreadID.h" +#include "OCLGlobalOffset.h" +#include "OCLImage2DFromBuffer.h" +#include "OCLImageCopyPartial.h" +#include "OCLKernelBinary.h" +#include "OCLLDS32K.h" +#include "OCLLinearFilter.h" +#include "OCLLiquidFlash.h" +#include "OCLMapCount.h" +#include "OCLMemDependency.h" +#include "OCLMemObjs.h" +#include "OCLMemoryInfo.h" +#include "OCLMultiQueue.h" +#include "OCLOfflineCompilation.h" +#include "OCLP2PBuffer.h" +#include "OCLPartialWrkgrp.h" +#include "OCLPerfCounters.h" +#include "OCLPersistent.h" +#include "OCLPinnedMemory.h" +#include "OCLPlatformAtomics.h" +#include "OCLProgramScopeVariables.h" +#include "OCLRTQueue.h" +#include "OCLReadWriteImage.h" +#include "OCLSDI.h" +#include "OCLSVM.h" +#include "OCLSemaphore.h" +#include "OCLStablePState.h" +#include "OCLThreadTrace.h" +#include "OCLUnalignedCopy.h" + +// +// Helper macro for adding tests +// +template +static void* dictionary_CreateTestFunc(void) { + return new T(); +} + +#define TEST(name) \ + { #name, &dictionary_CreateTestFunc < name> } + +TestEntry TestList[] = { + TEST(OCLCreateContext), + TEST(OCLAtomicCounter), + TEST(OCLKernelBinary), + TEST(OCLGlobalOffset), + TEST(OCLLinearFilter), + TEST(OCLAsyncTransfer), + TEST(OCLLDS32K), + TEST(OCLMemObjs), + TEST(OCLSemaphore), + TEST(OCLPartialWrkgrp), + TEST(OCLCreateBuffer), + TEST(OCLCreateImage), + TEST(OCLCPUGuardPages), + TEST(OCLMapCount), + TEST(OCLMemoryInfo), + TEST(OCLOfflineCompilation), + TEST(OCLMemDependency), + 
TEST(OCLGetQueueThreadID), + TEST(OCLDeviceQueries), + TEST(OCLSDI), + TEST(OCLThreadTrace), + TEST(OCLMultiQueue), + TEST(OCLImage2DFromBuffer), + TEST(OCLBufferFromImage), + TEST(OCLPerfCounters), + TEST(OCLSVM), + TEST(OCLProgramScopeVariables), + TEST(OCLGenericAddressSpace), + TEST(OCLDynamic), + TEST(OCLPlatformAtomics), + TEST(OCLDeviceAtomic), + TEST(OCLDynamicBLines), + TEST(OCLUnalignedCopy), + TEST(OCLBlitKernel), + TEST(OCLLiquidFlash), + TEST(OCLRTQueue), + TEST(OCLAsyncMap), + TEST(OCLPinnedMemory), + TEST(OCLReadWriteImage), + TEST(OCLStablePState), + TEST(OCLP2PBuffer), + // Failures in Linux. IOL doesn't support tiling aperture and Cypress linear + // image writes TEST(OCLPersistent), +}; + +unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]); +unsigned int TestLibVersion = 0; +const char* TestLibName = "oclruntime"; diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude b/projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude new file mode 100644 index 0000000000..a5807cb63c --- /dev/null +++ b/projects/clr/opencl/tests/ocltst/module/runtime/oclruntime.exclude @@ -0,0 +1,7 @@ +# all clear +OCLImageCopyPartial + +# EPR 362715 +OCLCPUGuardPages + +OCLRegionDeviceQueries