Files
2025-08-20 19:58:06 +05:30

562 строки
20 KiB
C++

/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLPerfSepia.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#define WIDTH 1024
#define HEIGHT 1024
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
#define MAX(a, b) (a > b ? a : b)
const char* sepiaVertexProgram =
"!!ARBvp1.0\n"
"\n"
"\n"
"OPTION ARB_position_invariant;\n"
"\n"
"PARAM p0 = program.local[2];\n"
"PARAM p1 = program.local[3];\n"
"ATTRIB a0 = vertex.texcoord[0];\n"
"OUTPUT o0 = result.texcoord[0];\n"
"OUTPUT o1 = result.texcoord[1];\n"
"TEMP r0, r1;\n"
"\n"
"MOV o0, a0;\n"
"#SWZ r1, a0, x, y, 0, 0;\n"
"#DPH r0.x, r1, p0;\n"
"#DPH r0.y, r1, p1;\n"
"#MOV o1, r0;\n"
"MOV o1, a0;\n"
"\n"
"END\n";
const char* sepiaFragmentProgram =
"!!ARBfp1.0\n"
"\n"
"\n"
"PARAM p0 = {1e-4, 0.085, 0.0, 0.0};\n"
"PARAM p1 = {0.2125, 0.7154, 0.0721, 0.0};\n"
"PARAM p2 = {-3605.984, 0.1323156, 0.0, -0.1991615};\n"
"PARAM p3 = {708.7939, -0.3903106, -0.05854013, 0.6621023};\n"
"PARAM p4 = {-50.93341, 0.4654831, 1.027555, -0.9069088};\n"
"PARAM p5 = {3.116672, 0.7926372, 0.03219686, 1.411847};\n"
"PARAM p6 = {8.95663e-4, -0.001104567, -6.0827e-4, 0.03277428};\n"
"PARAM p7 = program.local[0];\n"
"PARAM p8 = program.local[1];\n"
"ATTRIB a0 = fragment.texcoord[1];\n"
"OUTPUT o0 = result.color;\n"
"TEMP r0, r1, r2, r3;\n"
"\n"
"TEX r1, a0, texture[0], RECT;\n"
"#MAX r0, p0.x, r1.w;\n"
"#RCP r2, r0.x;\n"
"#DP3 r3, r1, p1;\n"
"#MUL r0, r3, r2;\n"
"#MAD r2, r0, p2, p3;\n"
"#MAD r2, r2, r0, p4;\n"
"#MAD r0, r2, r0, p5;\n"
"#MUL r2, r1.w, p6;\n"
"#MAD r2, r0, r3, r2;\n"
"#MAD r0, r1.w, p0.y, -r3;\n"
"#CMP r2.x, -r0, r2.x, r2.w;\n"
"#MAD r0, r3, r3, -r3;\n"
"#CMP r0, r0.x, r2, r3;\n"
"#MOV r0.w, r1;\n"
"#MUL r0, r0, p7;\n"
"#LRP o0, p8.x, r0, r1;\n"
"MOV o0, r1;\n"
"\n"
"END\n";
const static char* strKernel =
"\n"
"__kernel void program(write_only image2d_t dest, int flipped, int4 dim, "
"float2 st_origin, float4 st_delta, float4 l0, float4 l1, float4 l2, "
"float4 l3, read_only image2d_t t0, sampler_t t_sampler0)\n"
"{\n"
" const sampler_t sam = CLK_NORMALIZED_COORDS_FALSE | "
"CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"// const float4 p0 = (float4)( 0x1.b33334p-3, 0x1.6e48e8p-1, "
"0x1.275254p-4, 0x0p+0 );\n"
"// const float4 p1 = (float4)( 0x1.a36e2ep-14, 0x1.5c28f6p-4, 0x0p+0, "
"0x0p+0 );\n"
"// const float4 p2 = (float4)( 0x1.d595dap-11, -0x1.218e3cp-10, "
"-0x1.3ee89ep-11, 0x1.0c7ca6p-5 );\n"
"// const float4 p3 = (float4)( -0x1.c2bf7cp+11, 0x1.0efb7cp-3, "
"0x0p+0, -0x1.97e1fcp-3 );\n"
"// const float4 p4 = (float4)( 0x1.62659ep+9, -0x1.8fad94p-2, "
"-0x1.df8f8cp-5, 0x1.52ff12p-1 );\n"
"// const float4 p5 = (float4)( -0x1.9777ap+5, 0x1.dca79ap-2, "
"0x1.070dd8p+0, -0x1.d0565ap-1 );\n"
"// const float4 p6 = (float4)( 0x1.8eef1cp+1, 0x1.95d48cp-1, "
"0x1.07c1b6p-5, 0x1.696ecep+0 );\n"
"// int dest_width = dim.x;\n"
"// int dest_height = dim.y;\n"
" float4 o0, r0, r1, r2, r3, r4;\n"
"// float4 false_vector = (float4) 0.0f;\n"
"// float4 true_vector = (float4) 1.0f;\n"
" int2 loc = (int2)( get_global_id(0), get_global_id(1) );\n"
"// if ((loc.x >= dim.x) || loc.y >= dim.y) return;\n"
"// float4 f0 = (float4)( st_origin.x + ((float)loc.x + 0.5f) * "
"st_delta.x + ((float)loc.y + 0.5f) * st_delta.z, st_origin.y + "
"((float)loc.x + 0.5f) * st_delta.y + ((float)loc.y + 0.5f) * st_delta.w, "
"0.0f, 0.0f );\n"
"// r2 = f0;\n"
"// r0.x = dot(r2.xy,l2.xy) + l2.w;\n"
"// r0.y = dot(r2.xy,l3.xy) + l3.w;\n"
"// r4 = r0;\n"
" r1 = read_imagef(t0, sam/*t_sampler0*/, r4.xy);\n"
"// r3 = dot(r1.xyz,p0.xyz);\n"
"// r2 = max(p1.xxxx, r1.wwww);\n"
"// r0 = native_recip(r2.xxxx);\n"
"// r4 = r3*r0;\n"
"// r2 = r1.wwww*p2;\n"
"// r0 = mad(r4,p3,p4);\n"
"// r0 = mad(r0,r4,p5);\n"
"// r0 = mad(r0,r4,p6);\n"
"// r2 = mad(r0,r3,r2);\n"
"// r0 = mad(r1.wwww,p1.yyyy,-r3);\n"
"// r2.x = select(r2.w,r2.x, isless(-r0.x, 0.0f));\n"
"// r0 = mad(r3,r3,-r3);\n"
"// r0 = select(r3,r2, isless(r0.xxxx, 0.0f));\n"
"// r0.w = r1.w;\n"
"// r0 = r0*l0;\n"
"// r0 = mix(r1,r0, l1.xxxx);\n"
"// r0.xyz = min(r0.xyz, r0.www);\n"
"// o0 = r0;\n"
" write_imagef(dest, loc /*(int2)( loc.x + dim.z , flipped ? "
"get_image_height(dest) - (loc.y + dim.w + 1) : loc.y + dim.w )*/, r1 "
"/*o0*/);\n"
"}\n";
OCLGLPerfSepia::OCLGLPerfSepia() { _numSubTests = 2; }
OCLGLPerfSepia::~OCLGLPerfSepia() {}
void OCLGLPerfSepia::open(unsigned int test, char* units, double& conversion,
unsigned int deviceId) {
bVerify_ = false;
silentFailure_ = false;
iterations_ = 50000;
bpr_ = 0;
data_ = 0;
result_ = 0;
width_ = 0;
height_ = 0;
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
texId = 0;
format_.image_channel_order = CL_RGBA;
format_.image_channel_data_type = CL_UNORM_INT8;
srand(0x8956); // some constant instead of time() so that we get same random
// numbers
if (!IsGLEnabled(test, units, conversion, deviceId)) {
silentFailure_ = true;
return;
}
OCLGLCommon::open(test, units, conversion, deviceId);
if (_errorFlag) return;
if (test == 0) {
// Build the kernel
program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed (%d)", error_);
const char* optionsGPU = "-cl-denorms-are-zero -cl-mad-enable";
error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], optionsGPU, NULL, NULL);
if (error_ != CL_SUCCESS) {
char programLog[1024];
_wrapper->clGetProgramBuildInfo(program_, devices_[deviceId], CL_PROGRAM_BUILD_LOG, 1024,
programLog, 0);
printf("\n%s\n", programLog);
fflush(stdout);
}
CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
kernel_ = _wrapper->clCreateKernel(program_, "program", &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
}
void OCLGLPerfSepia::populateData(void) {
width_ = WIDTH;
height_ = HEIGHT;
bpr_ = 4 * width_;
data_ = (cl_uchar*)malloc(height_ * bpr_);
for (unsigned int n = 0; n < (height_ * bpr_); n++) {
data_[n] = (n & 3) ? (rand() % 256) : 0xFF;
}
}
void OCLGLPerfSepia::runGL(void) {
glDisable(GL_ALPHA_TEST);
glDisable(GL_DEPTH_TEST);
glDisable(GL_SCISSOR_TEST);
glDisable(GL_BLEND);
glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
glDisable(GL_DITHER);
glDisable(GL_CULL_FACE);
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
glDepthMask(GL_FALSE);
glStencilMask(0);
glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
// let's create the textures we need
glEnable(GL_TEXTURE_RECTANGLE_EXT);
glGenTextures(1, &texId);
glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);
// have GL alloc memory for us for our destination texture which we will be
// rendering into
glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0, GL_BGRA /*RGBA*/,
GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// for the source texture we will provide a data ptr and hang on to it
GLuint srcTexture;
glGenTextures(1, &srcTexture);
glBindTexture(GL_TEXTURE_RECTANGLE_EXT, srcTexture);
glPixelStorei(GL_UNPACK_ROW_LENGTH, width_);
glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, height_);
glPixelStorei(GL_UNPACK_ALIGNMENT, 8);
// XXX Alex -- use optimal texture upload format.
glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0, GL_BGRA, /* GL_RGBA,*/
format_.image_channel_order == CL_RGBA ? GL_UNSIGNED_INT_8_8_8_8
: GL_UNSIGNED_INT_8_8_8_8_REV,
data_);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glPixelStorei(GL_UNPACK_SWAP_BYTES, 0);
glPixelStorei(GL_UNPACK_LSB_FIRST, 0);
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0);
glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0);
glPixelStorei(GL_UNPACK_SKIP_IMAGES, 0);
glPixelStorei(GL_UNPACK_SKIP_ROWS, 0);
glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
GLuint vertexProgram;
GLuint fragmentProgram;
glGenProgramsARB(1, &vertexProgram);
glGenProgramsARB(1, &fragmentProgram);
glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexProgram);
glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
(GLsizei)strlen(sepiaVertexProgram), sepiaVertexProgram);
glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fragmentProgram);
glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
(GLsizei)strlen(sepiaFragmentProgram), sepiaFragmentProgram);
GLfloat l0[] = {1.0f, 0.99f, 0.92f, 1.0f};
GLfloat l1[] = {0.5, 0, 0, 0};
GLfloat l2[] = {1, 0, 0, 0};
GLfloat l3[] = {0, -1, 0, (GLfloat)height_};
glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 0, l0);
glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 1, l1);
glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 2, l2);
glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 3, l3);
glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, l0);
glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 1, l1);
glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 2, l2);
glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 3, l3);
GLuint fbo;
glGenFramebuffersEXT(1, &fbo);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_RECTANGLE_ARB,
texId, 0);
glViewport(0, 0, width_, height_);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0, width_, 0, height_, -1, 1);
glClearColor(0, 0, 0, 0);
glClear(GL_COLOR_BUFFER_BIT);
glDisable(GL_BLEND);
glEnable(GL_VERTEX_PROGRAM_ARB);
glEnable(GL_FRAGMENT_PROGRAM_ARB);
// warm up
for (unsigned int k = 0; k < (iterations_ / 10); k++) {
glBegin(GL_QUADS);
glTexCoord2f(0, 0);
glVertex2f(0, (GLfloat)height_);
glTexCoord2f((GLfloat)width_, 0);
glVertex2f((GLfloat)width_, (GLfloat)height_);
glTexCoord2f((GLfloat)width_, (GLfloat)height_);
glVertex2f((GLfloat)width_, 0);
glTexCoord2f(0, (GLfloat)height_);
glVertex2f(0, 0);
glEnd();
glFlush();
glFinish();
}
// actual test
for (unsigned int k = 0; k < iterations_; k++) {
if (k == 1) {
timer_.Reset();
timer_.Start();
}
glBegin(GL_QUADS);
glTexCoord2f(0, 0);
glVertex2f(0, (GLfloat)height_);
glTexCoord2f((GLfloat)width_, 0);
glVertex2f((GLfloat)width_, (GLfloat)height_);
glTexCoord2f((GLfloat)width_, (GLfloat)height_);
glVertex2f((GLfloat)width_, 0);
glTexCoord2f(0, (GLfloat)height_);
glVertex2f(0, 0);
glEnd();
}
glFlush();
glFinish();
timer_.Stop();
glDisable(GL_VERTEX_PROGRAM_ARB);
glDisable(GL_FRAGMENT_PROGRAM_ARB);
// now let's read back the pixels
result_ = (cl_uchar*)malloc(width_ * height_ * 4);
glReadPixels(0, 0, width_, height_, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, result_);
// bind back default frame buffer
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
glDeleteFramebuffersEXT(1, &fbo);
glDeleteTextures(1, &srcTexture);
glDeleteProgramsARB(1, &vertexProgram);
glDeleteProgramsARB(1, &fragmentProgram);
}
void OCLGLPerfSepia::runCL(void) {
cl_mem dst, src;
cl_sampler nearestZero;
glEnable(GL_TEXTURE_RECTANGLE_EXT);
glGenTextures(1, &texId);
glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);
// XXX Alex: have GL alloc memory for us ...
glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0, GL_RGBA /*BGRA*/,
GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
dst = _wrapper->clCreateFromGLTexture2D(context_, CL_MEM_READ_WRITE, GL_TEXTURE_RECTANGLE_EXT, 0,
texId, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateFromGLTexture2D error (%d)", error_);
nearestZero =
_wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler error (%d)", error_);
src = _wrapper->clCreateImage2D(context_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format_,
width_, height_, bpr_, data_, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage2D error (%d)", error_);
int numArgs = 0;
int dim[2] = {(int)width_, (int)height_};
int flipped[] = {1};
int dims[] = {(int)width_, (int)height_, 0, 0};
float st_origin[] = {0, 0};
float st_delta[] = {1, 0, 0, 1};
_wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_mem),
&dst); // arg is a image2DGL named "dst"
_wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(int),
&flipped); // arg is a int1 named "flipped"
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(int),
&dims); // arg is a int4 named "dim"
_wrapper->clSetKernelArg(kernel_, numArgs++, 2 * sizeof(float),
&st_origin); // arg is a float2 named "st_origin"
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
&st_delta); // arg is a float4 named "st_delta"
float l0[] = {1.0f, 0.99f, 0.92f, 1.0f};
float l1[] = {0.5f, 0.0f, 0.0f, 0.0f};
float l2[] = {1.0f, 0.0f, 0.0f, 0.0f};
float l3[] = {0.0f, -1.0f, 0.0f, (float)height_};
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
&l0); // arg is a float4 named "l0"
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
&l1); // arg is a float4 named "l1"
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
&l2); // arg is a float4 named "l2"
_wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
&l3); // arg is a float4 named "l3"
_wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_mem),
&src); // arg is a image2D named "t0"
_wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_sampler),
&nearestZero); // arg is a sampler named "t_sampler0"
size_t execution_threads[2];
size_t execution_local[2];
cl_uint work_dim = 2;
error_ =
_wrapper->clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId], CL_KERNEL_WORK_GROUP_SIZE,
sizeof(execution_local[0]), &execution_local[0], 0);
CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelWorkGroupInfo error (%d)", error_);
execution_local[1] = 1;
work_dim = 2;
GetKernelExecDimsForImage((unsigned int)execution_local[0], dim[0], dim[1], execution_threads,
execution_local);
result_ = (cl_uchar*)malloc(height_ * bpr_);
const size_t origin[] = {0, 0, 0};
const size_t region[] = {width_, height_, 1};
// warm up
for (unsigned int k = 0; k < (iterations_ / 10); k++) {
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, work_dim, NULL,
execution_threads, execution_local, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)", error_);
error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_);
}
// actual test
for (unsigned int k = 0; k < iterations_; k++) {
if (k == 1) {
timer_.Reset();
timer_.Start();
}
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, work_dim, NULL,
execution_threads, execution_local, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)", error_);
}
error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_);
timer_.Stop();
error_ = _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], dst, true, origin, region, bpr_, 0,
result_, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage error (%d)", error_);
_wrapper->clFinish(cmdQueues_[_deviceId]);
_wrapper->clReleaseMemObject(src), src = NULL;
_wrapper->clReleaseSampler(nearestZero);
_wrapper->clReleaseMemObject(dst), dst = NULL;
}
void OCLGLPerfSepia::GetKernelExecDimsForImage(unsigned int work_group_size, unsigned int w,
unsigned int h, size_t* global, size_t* local) {
unsigned int a, b;
static const unsigned int tile_size = 16;
// local[0] and local[1] must be at least 1
local[0] = tile_size < work_group_size ? tile_size : work_group_size;
local[1] =
work_group_size / tile_size > tile_size ? tile_size : MAX(work_group_size / tile_size, 1);
a = w;
b = (unsigned int)local[0];
global[0] = ((a % b) != 0) ? (a / b + 1) : (a / b);
global[0] *= local[0];
a = h;
b = (unsigned int)local[1];
global[1] = ((a % b) != 0) ? (a / b + 1) : (a / b);
global[1] *= local[1];
}
void OCLGLPerfSepia::run(void) {
if (_errorFlag || silentFailure_) {
return;
}
populateData();
if (_openTest == 0) {
runCL();
} else {
runGL();
}
if (bVerify_) {
verifyResult();
}
char buf[100];
SNPRINTF(buf, sizeof(buf), "%s iterations# %d", (_openTest == 0) ? "CL" : "GL", iterations_);
testDescString = buf;
_perfInfo = (float)timer_.GetElapsedTime();
}
void OCLGLPerfSepia::verifyResult(void) {
int r = 0, g = 0, b = 0, a = 0, d = 0;
for (unsigned int k = 0; k < height_ * bpr_; k += 4) {
a = a + result_[k + 0];
r = r + result_[k + 1];
g = g + result_[k + 2];
b = b + result_[k + 3];
}
d = abs(r - 152797810) + abs(g - 125868080) + abs(b - 76147833) + abs(a - 267386880);
CHECK_RESULT(d > 20000, "wrong result");
}
unsigned int OCLGLPerfSepia::close(void) {
if (silentFailure_) {
return 0;
}
if (data_) {
free(data_);
}
if (result_) {
free(result_);
}
if (texId) {
glDeleteTextures(1, &texId);
}
return OCLGLCommon::close();
}