From a456182888fa049d73f0deb5b37fac082b758fbd Mon Sep 17 00:00:00 2001 From: jatang Date: Mon, 5 Jun 2023 18:23:42 -0400 Subject: [PATCH] SWDEV-373396 - OCLCreateBuffer: make the CPU checkResult() 4 times faster Change-Id: If20cd6b509896a748f74ab5296cb85f2c4a9f04c --- opencl/amdocl/cl_memobj.cpp | 4 +-- .../ocltst/module/runtime/OCLCreateBuffer.cpp | 31 ++++++++++++------- .../ocltst/module/runtime/OCLCreateBuffer.h | 8 +++-- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/opencl/amdocl/cl_memobj.cpp b/opencl/amdocl/cl_memobj.cpp index 7a4851f110..468d79db6c 100644 --- a/opencl/amdocl/cl_memobj.cpp +++ b/opencl/amdocl/cl_memobj.cpp @@ -4240,8 +4240,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueFillBuffer, return CL_INVALID_VALUE; } - // Offset must be a multiple of pattern_size - if ((offset % pattern_size) != 0) { + // Offset and size must be multiple of pattern_size + if (!(amd::isMultipleOf(offset, pattern_size) && amd::isMultipleOf(size, pattern_size))) { return CL_INVALID_VALUE; } diff --git a/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp b/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp index 6120d5b12f..39b034123e 100644 --- a/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp +++ b/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp @@ -62,6 +62,10 @@ void OCLCreateBuffer::open(unsigned int test, char *units, double &conversion, maxSize_ = 1000; #endif // EMU_ENV cl_mem buf = NULL; + + // Make sure to use a size that's multiple of 8 (64bit). 
+ maxSize_ &= 0xFFFFFFFFFFFFFFF8; + buf = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, maxSize_, NULL, &error_); CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed"); @@ -72,7 +76,7 @@ void OCLCreateBuffer::open(unsigned int test, char *units, double &conversion, void OCLCreateBuffer::run(void) { CPerfCounter timer; - cl_uchar pattern = PATTERN; + cl_ulong pattern = PATTERN_20_64BIT; timer.Reset(); timer.Start(); error_ = /*_wrapper->*/ clEnqueueFillBuffer( @@ -90,6 +94,7 @@ void OCLCreateBuffer::run(void) { } #endif void *resultBuf = NULL; + // Reduce the buffer for the step transfers ahead of the allocation, // since huge buffers may cause paging and very low performance maxSteps /= 16; @@ -98,16 +103,14 @@ void OCLCreateBuffer::run(void) { continue; } - checkResult(maxSteps, resultBuf, pattern); + checkResult(maxSteps, resultBuf, PATTERN_20_64BIT); - pattern += 1; - - memset(resultBuf, pattern, maxSteps); + memset(resultBuf, PATTERN_2A_08BIT, maxSteps); writeBuffer(maxSteps, resultBuf); memset(resultBuf, 0x00, maxSteps); - checkResult(maxSteps, resultBuf, pattern); + checkResult(maxSteps, resultBuf, PATTERN_2A_64BIT); free(resultBuf); @@ -129,7 +132,7 @@ void OCLCreateBuffer::run(void) { } void OCLCreateBuffer::checkResult(size_t maxSteps, void *resultBuf, - cl_uchar pattern) { + cl_ulong pattern) { size_t startPoint = 0; while ((startPoint) < maxSize_) { cl_event ee; @@ -142,14 +145,18 @@ void OCLCreateBuffer::checkResult(size_t maxSteps, void *resultBuf, resultBuf, 0, NULL, &ee); CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadBuffer() failed"); _wrapper->clFinish(cmdQueues_[_deviceId]); - size_t cnt = 0; - cl_uchar *cc = (cl_uchar *)resultBuf; - for (size_t i = 0; i < readSize; i++) { + + size_t err_cnt = 0; + size_t chk_cnt = readSize / sizeof(cl_ulong); + + cl_ulong *cc = reinterpret_cast<cl_ulong *>(resultBuf); + + for (size_t i = 0; i < chk_cnt; i++) { if (cc[i] != pattern) { - cnt++; + err_cnt++; } } - if (cnt != 0) { 
error_ = -1; CHECK_RESULT((error_ != CL_SUCCESS), "checkResult() failed"); break; diff --git a/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h b/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h index d02117c5b8..2c63984096 100644 --- a/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h +++ b/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.h @@ -22,7 +22,10 @@ #define _OCL_CREATE_BUFFER_H_ #include "OCLTestImp.h" -#define PATTERN 0x20 +#define PATTERN_20_08BIT 0x20 +#define PATTERN_20_64BIT 0x2020202020202020 +#define PATTERN_2A_08BIT 0x2a +#define PATTERN_2A_64BIT 0x2a2a2a2a2a2a2a2a class OCLCreateBuffer : public OCLTestImp { public: @@ -34,8 +37,7 @@ class OCLCreateBuffer : public OCLTestImp { unsigned int deviceID); virtual void run(void); virtual void writeBuffer(size_t tmpMaxSize, void* dataBuf); - virtual void checkResult(size_t tmpMaxSize, void* resultBuf, - cl_uchar pattern); + virtual void checkResult(size_t tmpMaxSize, void* resultBuf, cl_ulong pattern); virtual unsigned int close(void); private: