2021-07-08 21:54:02 -07:00
|
|
|
/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.
|
2020-05-29 12:10:04 -04:00
|
|
|
|
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
|
|
|
in the Software without restriction, including without limitation the rights
|
|
|
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
|
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
|
|
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
|
|
|
all copies or substantial portions of the Software.
|
|
|
|
|
|
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
|
THE SOFTWARE. */
|
|
|
|
|
|
2021-07-07 18:03:52 -04:00
|
|
|
#include "OCLPerfKernelThroughput.h"
|
|
|
|
|
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#include <math.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
|
|
#include <sstream>
|
|
|
|
|
|
|
|
|
|
#include "CL/cl.h"
|
|
|
|
|
#include "Timer.h"
|
|
|
|
|
|
|
|
|
|
#define DO_GPU_KERNELS 1
|
|
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
|
#define ENTER(X) printf("Entering %s\n", X);
|
|
|
|
|
#define EXIT(X) printf("Exiting %s\n", X);
|
|
|
|
|
#define PKT(X) X
|
|
|
|
|
#else
|
|
|
|
|
#define ENTER(X)
|
|
|
|
|
#define EXIT(X)
|
|
|
|
|
#define PKT(X)
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// work with multiples of 128
|
|
|
|
|
#define ROUND_MULT(VAL, MULT) ((VAL / MULT) * MULT)
|
|
|
|
|
/*
|
|
|
|
|
int roundUp( int numToRound, int multiple)
|
|
|
|
|
{
|
|
|
|
|
int r = numToRound % multiple;
|
|
|
|
|
if (r == 0)
|
|
|
|
|
{
|
|
|
|
|
return numToRound;
|
|
|
|
|
} else {
|
|
|
|
|
return numToRound + multiple - remainder;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
*/
|
|
|
|
|
// quiety warnings
|
|
|
|
|
#ifdef WIN_OS
|
|
|
|
|
#define SNPRINTF sprintf_s
|
|
|
|
|
#else
|
|
|
|
|
#define SNPRINTF snprintf
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#define WORK_GROUP_SIZE 256
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Enumerated Types for Tests
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
|
|
|
|
|
// memory operations
|
|
|
|
|
const LARGE_INT numKernelTypes = 2;
|
|
|
|
|
static const char* kernelType[numKernelTypes] = {"MatMul", "Madds"};
|
|
|
|
|
|
|
|
|
|
// source/read memory locations
|
|
|
|
|
const LARGE_INT numMemPaths = 2;
|
|
|
|
|
static const char* memPath[numMemPaths] = {"Host", "Device"};
|
|
|
|
|
|
|
|
|
|
// buffer size
|
|
|
|
|
const LARGE_INT numNumElements = 12; // 15;
|
|
|
|
|
static const LARGE_INT numElements[numNumElements] = {
|
|
|
|
|
4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304,
|
|
|
|
|
16777216 //,
|
|
|
|
|
// 67108864,
|
|
|
|
|
// 268435456
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// flops/byte
|
|
|
|
|
const LARGE_INT numWorkSizes = 5;
|
|
|
|
|
static const LARGE_INT workSize[numWorkSizes] = {1, 4, 16, 64, 256};
|
|
|
|
|
|
|
|
|
|
const float initFloat = 0.001f;
|
|
|
|
|
const float zeroFloat = 0.0f;
|
|
|
|
|
|
|
|
|
|
#define WORK_GROUP_SIZE 256
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Write the Matrix Multiply Shader Kernel
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
void OCLPerfKernelThroughput::genShaderMatrixMultiply() {
|
|
|
|
|
ENTER("genShaderMatrixMultiply");
|
|
|
|
|
|
|
|
|
|
std::stringstream ss;
|
|
|
|
|
ss.clear();
|
|
|
|
|
#if 0
|
|
|
|
|
printf("%ix%i * %ix%i = %ix%i:\n",
|
|
|
|
|
matrixDim1_, matrixDim2_,
|
|
|
|
|
matrixDim2_, matrixDim1_,
|
|
|
|
|
matrixDim1_, matrixDim1_
|
|
|
|
|
);
|
|
|
|
|
#endif
|
|
|
|
|
ss << "#define BLOCK_SIZE 16\n"
|
|
|
|
|
"#define HA "
|
|
|
|
|
<< matrixDim1_
|
|
|
|
|
<< "\n"
|
|
|
|
|
"#define WA "
|
|
|
|
|
<< matrixDim2_
|
|
|
|
|
<< "\n"
|
|
|
|
|
"#define HB WA\n"
|
|
|
|
|
"#define WB HA\n"
|
|
|
|
|
"#define HC HA\n"
|
|
|
|
|
"#define WC WB\n"
|
|
|
|
|
"__kernel void\n"
|
|
|
|
|
"__attribute__((reqd_work_group_size(16,16,1)))\n"
|
|
|
|
|
"kernel1(\n"
|
|
|
|
|
" __global float * restrict C,\n"
|
|
|
|
|
" __global float * restrict A,\n"
|
|
|
|
|
" __global float * restrict B )\n"
|
|
|
|
|
"{\n"
|
|
|
|
|
" int bx = get_group_id(0);\n"
|
|
|
|
|
" int by = get_group_id(1);\n"
|
|
|
|
|
" int tx = get_local_id(0);\n"
|
|
|
|
|
" int ty = get_local_id(1);\n"
|
|
|
|
|
" int aBegin = WA * BLOCK_SIZE * by;\n"
|
|
|
|
|
" int aEnd = aBegin + WA - 1;\n"
|
|
|
|
|
" int aStep = BLOCK_SIZE;\n"
|
|
|
|
|
" int bBegin = BLOCK_SIZE * bx;\n"
|
|
|
|
|
" int bStep = BLOCK_SIZE * WB;\n"
|
|
|
|
|
" __private float c = 0.f;\n"
|
|
|
|
|
" __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n"
|
|
|
|
|
" __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n"
|
|
|
|
|
" for (\n"
|
|
|
|
|
" int a = aBegin, b = bBegin;\n"
|
|
|
|
|
" a <= aEnd;\n"
|
|
|
|
|
" a += aStep, b += bStep)\n"
|
|
|
|
|
" {\n"
|
|
|
|
|
" localA[ty][tx] = (get_global_id(0) < WA && get_global_id(1) < "
|
|
|
|
|
"HA) ? A[a + WA * ty + tx] : 0;\n"
|
|
|
|
|
" localB[ty][tx] = (get_global_id(0) < WB && get_global_id(1) < "
|
|
|
|
|
"HB) ? B[b + WB * ty + tx] : 0;\n"
|
|
|
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
|
|
|
" for (int k = 0; k < BLOCK_SIZE; ++k)\n"
|
|
|
|
|
" c += localA[ty][k] * localB[k][tx];\n"
|
|
|
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
|
|
|
" }\n"
|
|
|
|
|
" int cIdx = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx + WB * ty + tx;\n"
|
|
|
|
|
" if (get_global_id(0) < WC && get_global_id(1) < WC)\n"
|
|
|
|
|
" {\n"
|
|
|
|
|
" C[cIdx] = c;\n"
|
|
|
|
|
" }\n"
|
|
|
|
|
"}\n";
|
|
|
|
|
|
|
|
|
|
shader_ = ss.str();
|
|
|
|
|
gold_ = 0.f;
|
|
|
|
|
for (int i = 0; i < matrixDim2_; i++) gold_ += initFloat * initFloat;
|
|
|
|
|
// gold_ = initFloat * initFloat * matrixDim2_;
|
|
|
|
|
// printf("shader:\n%s\n", shader_.c_str());
|
|
|
|
|
// printf("gold_: %f\n", gold_);
|
|
|
|
|
EXIT("genShaderMatrixMultiply");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Write the Madds Shader Kernel
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
void OCLPerfKernelThroughput::genShaderMadds() {
|
|
|
|
|
ENTER("genShaderMadds");
|
|
|
|
|
|
|
|
|
|
int flopLoopIter = 2 * (flopsPerByte_ * 4 * 4) / 16; // bytes, flops
|
|
|
|
|
|
|
|
|
|
std::stringstream ss;
|
|
|
|
|
ss.clear();
|
|
|
|
|
float a, b;
|
|
|
|
|
|
|
|
|
|
ss << // begin kernel
|
|
|
|
|
"__kernel void\n"
|
|
|
|
|
"__attribute__((reqd_work_group_size("
|
|
|
|
|
<< 256
|
|
|
|
|
<< ",1,1)))\n"
|
|
|
|
|
"kernel1(\n"
|
|
|
|
|
" __global float4 * restrict input,\n"
|
|
|
|
|
" __global float4 * restrict output )\n"
|
|
|
|
|
"{\n";
|
|
|
|
|
|
|
|
|
|
// begin loop
|
|
|
|
|
ss << " for ( uint idx = get_global_id(0);\n"
|
|
|
|
|
" idx < "
|
|
|
|
|
<< numElements[numElementsIdx_]
|
|
|
|
|
<< ";\n"
|
|
|
|
|
" idx += get_global_size(0) )\n"
|
|
|
|
|
" {\n";
|
|
|
|
|
|
|
|
|
|
// do load
|
|
|
|
|
ss << " float4 prefetch = input[ idx ];\n"
|
|
|
|
|
" float a0 = prefetch.x;\n"
|
|
|
|
|
" float a1 = prefetch.y;\n"
|
|
|
|
|
" float a2 = prefetch.z;\n"
|
|
|
|
|
" float a3 = prefetch.w;\n"
|
|
|
|
|
" float b0 = a0;\n"
|
|
|
|
|
" float b1 = a1;\n"
|
|
|
|
|
" float b2 = a2;\n"
|
|
|
|
|
" float b3 = a3;\n";
|
|
|
|
|
a = initFloat;
|
|
|
|
|
b = a;
|
|
|
|
|
|
|
|
|
|
// do math
|
|
|
|
|
for (int i = 0; i < flopLoopIter; i++) {
|
|
|
|
|
ss << " a0 += b3*b1;\n"
|
|
|
|
|
" a1 += b0*b2;\n"
|
|
|
|
|
" a2 += b1*b3;\n"
|
|
|
|
|
" a3 += b2*b0;\n"
|
|
|
|
|
" b0 += a3*a1;\n"
|
|
|
|
|
" b1 += a0*a2;\n"
|
|
|
|
|
" b2 += a1*a3;\n"
|
|
|
|
|
" b3 += a2*a0;\n";
|
|
|
|
|
// printf("a += b*b; %f += %f*%f\n", a, b, b);
|
|
|
|
|
a += b * b;
|
|
|
|
|
// printf("b += a*a; %f += %f*%f\n", b, a, a);
|
|
|
|
|
b += a * a;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// do write or accumulate
|
|
|
|
|
ss << " __private float4 tmp;\n"
|
|
|
|
|
" tmp.x = b0;\n"
|
|
|
|
|
" tmp.y = b1;\n"
|
|
|
|
|
" tmp.z = b2;\n"
|
|
|
|
|
" tmp.w = b3;\n"
|
|
|
|
|
" output[ idx ] = tmp;\n";
|
|
|
|
|
gold_ = b;
|
|
|
|
|
// printf("GPU gold_ Tmp: %f\n", gold_);
|
|
|
|
|
|
|
|
|
|
// end loop
|
|
|
|
|
ss << " } // end loop\n";
|
|
|
|
|
// end kernel
|
|
|
|
|
ss << " } // end kernel\n\n";
|
|
|
|
|
|
|
|
|
|
shader_ = ss.str();
|
|
|
|
|
// printf("shader:\n%s\n", shader_.c_str());
|
|
|
|
|
// printf("gold_: %f\n", gold_);
|
|
|
|
|
EXIT("genShaderMadds");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void CL_CALLBACK notify_callback(const char* errinfo, const void* private_info, size_t cb,
|
|
|
|
|
void* user_data) {}
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Constructor
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
OCLPerfKernelThroughput::OCLPerfKernelThroughput() {
|
|
|
|
|
ENTER("constructor");
|
|
|
|
|
_numSubTests = numKernelTypes * numMemPaths * numNumElements * numWorkSizes;
|
|
|
|
|
|
|
|
|
|
cl_uint numPlatforms;
|
|
|
|
|
cl_platform_id platform = NULL;
|
|
|
|
|
cl_uint num_devices = 0;
|
|
|
|
|
cl_device_id* devices = NULL;
|
|
|
|
|
cl_device_id device = NULL;
|
|
|
|
|
context_ = 0;
|
|
|
|
|
|
|
|
|
|
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
|
|
|
|
|
if (0 < numPlatforms) {
|
|
|
|
|
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
|
|
|
|
|
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
|
|
|
|
|
// Get last for default
|
|
|
|
|
platform = platforms[numPlatforms - 1];
|
|
|
|
|
for (unsigned i = 0; i < numPlatforms; ++i) {
|
|
|
|
|
char pbuf[100];
|
|
|
|
|
error_ =
|
|
|
|
|
_wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
|
|
|
|
|
num_devices = 0;
|
|
|
|
|
/* Get the number of requested devices */
|
|
|
|
|
error_ = _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices);
|
|
|
|
|
// Runtime returns an error when no GPU devices are present
|
|
|
|
|
// instead of just returning 0 devices
|
|
|
|
|
// CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
|
|
|
|
|
// Choose platform with GPU devices
|
|
|
|
|
if (num_devices > 0) {
|
|
|
|
|
// printf("NumDevices: %i\n", num_devices);
|
|
|
|
|
platform = platforms[i];
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
delete platforms;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we could find our platform, use it, else die.
|
|
|
|
|
*/
|
|
|
|
|
CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
|
|
|
|
|
|
|
|
|
|
devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id));
|
|
|
|
|
CHECK_RESULT(devices == 0, "no devices");
|
|
|
|
|
|
|
|
|
|
/* Get the requested device */
|
|
|
|
|
error_ = _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
|
|
|
|
|
|
|
|
|
|
CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
|
|
|
|
|
device = devices[_deviceId];
|
|
|
|
|
|
|
|
|
|
// get gpu speed
|
|
|
|
|
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
|
|
|
|
|
sizeof(maxClockFrequency_), &maxClockFrequency_, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
|
|
|
|
|
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits_),
|
|
|
|
|
&maxComputeUnits_, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
|
|
|
|
|
if (maxComputeUnits_ > 8) {
|
|
|
|
|
// printf("%i CUs reported; assuming 8 instead.", maxComputeUnits_);
|
|
|
|
|
maxComputeUnits_ = 8;
|
|
|
|
|
}
|
|
|
|
|
// printf("Compute Units: %i\n", maxComputeUnits_);
|
|
|
|
|
|
|
|
|
|
// printf("Subtests: %i\n", _numSubTests);
|
|
|
|
|
|
|
|
|
|
// create context
|
|
|
|
|
context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(context_ == 0, "clCreateContext failed");
|
|
|
|
|
|
|
|
|
|
char charbuf[1024];
|
|
|
|
|
size_t retsize;
|
|
|
|
|
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, charbuf, &retsize);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
|
|
|
|
|
|
|
|
|
|
if (context_) {
|
|
|
|
|
error_ = _wrapper->clReleaseContext(context_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cl_uint tmp;
|
|
|
|
|
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(tmp), &tmp, NULL);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
|
|
|
|
|
// printf("NumComputeUnits: %u\n", tmp);
|
|
|
|
|
maxComputeUnits_ = static_cast<LARGE_INT>(tmp);
|
|
|
|
|
// printf("NumComputeUnits: %lld\n", maxComputeUnits_);
|
|
|
|
|
EXIT("constructor");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
OCLPerfKernelThroughput::~OCLPerfKernelThroughput() {}
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Open - initializes test, compile GPU kernel
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
void OCLPerfKernelThroughput::open(unsigned int test, char* units, double& conversion,
|
|
|
|
|
unsigned int deviceId) {
|
|
|
|
|
ENTER("open");
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* select subtest
|
|
|
|
|
**********************************************************/
|
|
|
|
|
int testIdx = test + numKernelTypes * numMemPaths * numNumElements * numWorkSizes;
|
|
|
|
|
memPathIdx_ = testIdx % numMemPaths;
|
|
|
|
|
testIdx /= numMemPaths;
|
|
|
|
|
numElementsIdx_ = testIdx % numNumElements;
|
|
|
|
|
testIdx /= numNumElements;
|
|
|
|
|
workSizeIdx_ = testIdx % numWorkSizes;
|
|
|
|
|
testIdx /= numWorkSizes;
|
|
|
|
|
kernelTypeIdx_ = testIdx % numKernelTypes;
|
|
|
|
|
testIdx /= numKernelTypes;
|
|
|
|
|
|
|
|
|
|
// float md1;
|
|
|
|
|
|
|
|
|
|
// kernel values
|
|
|
|
|
switch (kernelTypeIdx_) {
|
|
|
|
|
case 0: // Matrix Multiply
|
|
|
|
|
// md1 = sqrt(1.f*numElements[numElementsIdx_]);
|
|
|
|
|
// printf("MD1: sqrt(%f) = %f\n", 1.f*numElements[numElementsIdx_],md1);
|
|
|
|
|
matrixDim1_ = static_cast<int>(sqrt(1.f * numElements[numElementsIdx_]));
|
|
|
|
|
matrixDim2_ = matrixDim1_ * (int)workSize[workSizeIdx_];
|
|
|
|
|
genShaderMatrixMultiply();
|
|
|
|
|
work_dim_ = 2;
|
|
|
|
|
global_work_size_ = new size_t[work_dim_];
|
|
|
|
|
global_work_size_[0] =
|
|
|
|
|
((matrixDim1_ - 1) / 16 + 1) * 16; // matrixDim1_ < 16 ? 16 : matrixDim1_;
|
|
|
|
|
global_work_size_[1] = global_work_size_[0];
|
|
|
|
|
local_work_size_ = new size_t[work_dim_];
|
|
|
|
|
local_work_size_[0] = 16;
|
|
|
|
|
local_work_size_[1] = local_work_size_[0];
|
|
|
|
|
/*
|
|
|
|
|
printf("Global: %ix%i; Local: %ix%i; Matrix: %ix%i\n",
|
|
|
|
|
global_work_size_[0],
|
|
|
|
|
global_work_size_[1],
|
|
|
|
|
local_work_size_[0],
|
|
|
|
|
local_work_size_[1],
|
|
|
|
|
matrixDim1_,
|
|
|
|
|
matrixDim2_
|
|
|
|
|
);
|
|
|
|
|
*/
|
|
|
|
|
input1BufferSize_ = static_cast<size_t>(matrixDim1_ * matrixDim2_ * sizeof(float));
|
|
|
|
|
input2BufferSize_ = static_cast<size_t>(matrixDim2_ * matrixDim1_ * sizeof(float));
|
|
|
|
|
output1BufferSize_ = static_cast<size_t>(matrixDim1_ * matrixDim1_ * sizeof(float));
|
|
|
|
|
_reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) +
|
|
|
|
|
(1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) +
|
|
|
|
|
(1.0 * matrixDim1_ * matrixDim1_ * sizeof(float));
|
|
|
|
|
break;
|
|
|
|
|
case 1: // Flops/Byte
|
|
|
|
|
flopsPerByte_ = (int)workSize[workSizeIdx_]; // for kernelType == 0
|
|
|
|
|
genShaderMadds();
|
|
|
|
|
numWorkGroupsPerComputeUnit_ = 32; // TODO
|
|
|
|
|
numThreads_ = numWorkGroupsPerComputeUnit_ * maxComputeUnits_ * WORK_GROUP_SIZE;
|
|
|
|
|
work_dim_ = 1;
|
|
|
|
|
global_work_size_ = new size_t[work_dim_];
|
|
|
|
|
local_work_size_ = new size_t[work_dim_];
|
|
|
|
|
global_work_size_[0] = numThreads_;
|
|
|
|
|
local_work_size_[0] = WORK_GROUP_SIZE;
|
|
|
|
|
input1BufferSize_ = static_cast<size_t>(numElements[numElementsIdx_] * sizeof(float4));
|
|
|
|
|
input2BufferSize_ = 0;
|
|
|
|
|
output1BufferSize_ = static_cast<size_t>(numElements[numElementsIdx_] * sizeof(float4));
|
|
|
|
|
_reqDataSize = 2.0 * numElements[numElementsIdx_] * sizeof(float4);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
PKT(printf("Test Parameters:\n"
|
|
|
|
|
"\tkernelTypeIdx: %i\n"
|
|
|
|
|
"\tmemPathIdx: %i\n"
|
|
|
|
|
"\tnumElementsIdx: %i\n"
|
|
|
|
|
"\tworkSizeIdx: %i\n"
|
|
|
|
|
"\n\n",
|
|
|
|
|
kernelTypeIdx_, memPathIdx_, numElementsIdx_, workSizeIdx_);)
|
|
|
|
|
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* get context and queue
|
|
|
|
|
**********************************************************/
|
|
|
|
|
cl_uint numPlatforms;
|
|
|
|
|
cl_platform_id platform = NULL;
|
|
|
|
|
cl_uint num_devices = 0;
|
|
|
|
|
cl_device_id* devices = NULL;
|
|
|
|
|
cl_device_id device = NULL;
|
|
|
|
|
_crcword = 0;
|
|
|
|
|
conversion = 1.0;
|
|
|
|
|
_deviceId = deviceId;
|
|
|
|
|
|
|
|
|
|
context_ = 0;
|
|
|
|
|
cmd_queue_ = 0;
|
|
|
|
|
program_ = 0;
|
|
|
|
|
kernel_ = 0;
|
|
|
|
|
input1Buffer_ = 0;
|
|
|
|
|
output1Buffer_ = 0;
|
|
|
|
|
_errorFlag = false; // Reset error code so a single error
|
|
|
|
|
// doesn't prevent other subtests from running
|
|
|
|
|
_errorMsg = "";
|
|
|
|
|
|
|
|
|
|
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
|
|
|
|
|
if (0 < numPlatforms) {
|
|
|
|
|
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
|
|
|
|
|
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
|
|
|
|
|
|
|
|
|
|
platform = platforms[_platformIndex];
|
|
|
|
|
char pbuf[100];
|
|
|
|
|
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], CL_PLATFORM_VENDOR,
|
|
|
|
|
sizeof(pbuf), pbuf, NULL);
|
|
|
|
|
num_devices = 0;
|
|
|
|
|
/* Get the number of requested devices */
|
|
|
|
|
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, &num_devices);
|
|
|
|
|
// Runtime returns an error when no GPU devices are present
|
|
|
|
|
// instead of just returning 0 devices
|
|
|
|
|
// CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
|
|
|
|
|
delete platforms;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we could find our platform, use it, else die.
|
|
|
|
|
*/
|
|
|
|
|
CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
|
|
|
|
|
|
|
|
|
|
devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id));
|
|
|
|
|
CHECK_RESULT(devices == 0, "no devices");
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Get the requested device
|
|
|
|
|
*/
|
|
|
|
|
error_ = _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
|
|
|
|
|
|
|
|
|
|
device = devices[0];
|
|
|
|
|
|
|
|
|
|
context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(context_ == 0, "clCreateContext failed");
|
|
|
|
|
|
|
|
|
|
cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, CL_QUEUE_PROFILING_ENABLE, NULL);
|
|
|
|
|
CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
|
|
|
|
|
|
|
|
|
|
// Global memory size
|
|
|
|
|
cl_ulong _maxMemoryAllocationSize;
|
|
|
|
|
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong),
|
|
|
|
|
&_maxMemoryAllocationSize, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_SIZE) failed");
|
|
|
|
|
#if 0
|
|
|
|
|
printf("Buffer Sizes: %i %i %i = %f\n",
|
|
|
|
|
input1BufferSize_,
|
|
|
|
|
input2BufferSize_,
|
|
|
|
|
output1BufferSize_,
|
|
|
|
|
_reqDataSize);
|
|
|
|
|
#endif
|
|
|
|
|
_dataSizeTooBig = (_reqDataSize > _maxMemoryAllocationSize);
|
|
|
|
|
if (_dataSizeTooBig) {
|
|
|
|
|
// printf("DATA TOO LARGE FOR DEVICE !!!");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// create kernel
|
|
|
|
|
char* tmp = (char*)shader_.c_str();
|
|
|
|
|
program_ = _wrapper->clCreateProgramWithSource(context_, 1, (const char**)&tmp, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
|
|
|
|
|
|
|
|
|
|
std::string args;
|
|
|
|
|
args.clear();
|
|
|
|
|
error_ = _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL);
|
|
|
|
|
if (error_ != CL_SUCCESS) {
|
|
|
|
|
cl_int intError;
|
|
|
|
|
char log[16384];
|
|
|
|
|
intError = _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
|
|
|
|
|
16384 * sizeof(char), log, NULL);
|
|
|
|
|
printf("Build error -> %s\n", log);
|
|
|
|
|
CHECK_RESULT(0, "clBuildProgram failed");
|
|
|
|
|
}
|
|
|
|
|
kernel_ = _wrapper->clCreateKernel(program_, "kernel1", &error_);
|
|
|
|
|
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
|
|
|
|
|
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* Allocate GPU Memory
|
|
|
|
|
**********************************************************/
|
|
|
|
|
cl_mem_flags inputBufferFlags = 0;
|
|
|
|
|
cl_mem_flags outputBufferFlags = 0;
|
|
|
|
|
|
|
|
|
|
// choose gpu source buffer type
|
|
|
|
|
switch (memPathIdx_) {
|
|
|
|
|
case 0: // host memory
|
|
|
|
|
// printf("Allocating Host Memories\n");
|
|
|
|
|
// allocate "device" memory
|
|
|
|
|
inputBufferFlags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
|
|
|
outputBufferFlags = CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
|
|
|
input1Buffer_ =
|
|
|
|
|
_wrapper->clCreateBuffer(context_, inputBufferFlags, input1BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
if (input1Buffer_ == 0) printf("Error: %i\n", error_);
|
|
|
|
|
if (input2BufferSize_) {
|
|
|
|
|
input2Buffer_ =
|
|
|
|
|
_wrapper->clCreateBuffer(context_, inputBufferFlags, input2BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
}
|
|
|
|
|
output1Buffer_ =
|
|
|
|
|
_wrapper->clCreateBuffer(context_, outputBufferFlags, output1BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
if (output1Buffer_ == 0) printf("Error: %i\n", error_);
|
|
|
|
|
|
|
|
|
|
// map host memory
|
|
|
|
|
input1Ptr_ =
|
|
|
|
|
(float*)_wrapper->clEnqueueMapBuffer(cmd_queue_, input1Buffer_, true, CL_MAP_WRITE, 0,
|
|
|
|
|
input1BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
if (input2BufferSize_) {
|
|
|
|
|
input2Ptr_ =
|
|
|
|
|
(float*)_wrapper->clEnqueueMapBuffer(cmd_queue_, input2Buffer_, true, CL_MAP_WRITE, 0,
|
|
|
|
|
input2BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
}
|
|
|
|
|
output1Ptr_ =
|
|
|
|
|
(float*)_wrapper->clEnqueueMapBuffer(cmd_queue_, output1Buffer_, true, CL_MAP_READ, 0,
|
|
|
|
|
output1BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 1: // device memory
|
|
|
|
|
// printf("Allocating Device Memories\n");
|
|
|
|
|
// allocate device memory
|
|
|
|
|
inputBufferFlags = CL_MEM_READ_WRITE;
|
|
|
|
|
outputBufferFlags = CL_MEM_READ_WRITE;
|
|
|
|
|
input1Buffer_ =
|
|
|
|
|
_wrapper->clCreateBuffer(context_, inputBufferFlags, input1BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
if (input2BufferSize_) {
|
|
|
|
|
input2Buffer_ =
|
|
|
|
|
_wrapper->clCreateBuffer(context_, inputBufferFlags, input2BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
}
|
|
|
|
|
output1Buffer_ =
|
|
|
|
|
_wrapper->clCreateBuffer(context_, outputBufferFlags, output1BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
// printf("\tDone Allocating Device Memory\n");
|
|
|
|
|
|
|
|
|
|
// allocate host memory
|
|
|
|
|
input1Ptr_ = new float[input1BufferSize_ / sizeof(float)];
|
|
|
|
|
if (input2BufferSize_) {
|
|
|
|
|
input2Ptr_ = new float[input2BufferSize_ / sizeof(float)];
|
|
|
|
|
}
|
|
|
|
|
output1Ptr_ = new float[output1BufferSize_ / sizeof(float)];
|
|
|
|
|
// printf("\tDone Allocating Host Memory\n");
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
CHECK_RESULT(1, "Invalid Memory Path Idx");
|
|
|
|
|
// invalid
|
|
|
|
|
}
|
|
|
|
|
for (unsigned int i = 0; i < input1BufferSize_ / sizeof(float); i++) {
|
|
|
|
|
input1Ptr_[i] = initFloat;
|
|
|
|
|
}
|
|
|
|
|
for (unsigned int i = 0; i < input2BufferSize_ / sizeof(float); i++) {
|
|
|
|
|
input2Ptr_[i] = initFloat;
|
|
|
|
|
}
|
|
|
|
|
for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) {
|
|
|
|
|
output1Ptr_[i] = zeroFloat;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
|
printf("Allocating GPU: %.0fMB, %.0fMB\n",
|
|
|
|
|
static_cast<float>(1.f*input1BufferSize_/1024.f/1024.f),
|
|
|
|
|
static_cast<float>(1.f*output1BufferSize_/1024.f/1024.f));
|
|
|
|
|
input1Buffer_ = _wrapper->clCreateBuffer(
|
|
|
|
|
context_, inputBufferFlags, input1BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed");
|
|
|
|
|
output1Buffer_ = _wrapper->clCreateBuffer(
|
|
|
|
|
context_, outputBufferFlags, output1BufferSize_, NULL, &error_);
|
|
|
|
|
CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Output failed");
|
|
|
|
|
error_ = /*_wrapper->*/clEnqueueFillBuffer(
|
|
|
|
|
cmd_queue_, input1Buffer_, &initFloat, sizeof(initFloat),
|
|
|
|
|
0, input1BufferSize_, 0, NULL, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed");
|
|
|
|
|
error_ = /*_wrapper->*/clEnqueueFillBuffer(
|
|
|
|
|
cmd_queue_, output1Buffer_, &zeroFloat, sizeof(zeroFloat),
|
|
|
|
|
0, output1BufferSize_, 0, NULL, NULL);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed");
|
|
|
|
|
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* Set Kernel Args
|
|
|
|
|
**********************************************************/
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(
|
|
|
|
|
kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(
|
|
|
|
|
kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
EXIT("open");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Run - execute full test once and return performance
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
void OCLPerfKernelThroughput::run(void) {
|
|
|
|
|
ENTER("run");
|
|
|
|
|
CPerfCounter timer;
|
|
|
|
|
if (!_dataSizeTooBig) {
|
|
|
|
|
// set kernel args
|
|
|
|
|
#if 1
|
|
|
|
|
switch (kernelTypeIdx_) {
|
|
|
|
|
case 0: // Matrix Multiply
|
|
|
|
|
error_ =
|
|
|
|
|
_wrapper->clSetKernelArg(kernel_, 0, sizeof(output1Buffer_), (void*)&output1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(input1Buffer_), (void*)&input1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(input2Buffer_), (void*)&input2Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
break;
|
|
|
|
|
case 1: // Flops/Byte
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(input1Buffer_), (void*)&input1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
error_ =
|
|
|
|
|
_wrapper->clSetKernelArg(kernel_, 1, sizeof(output1Buffer_), (void*)&output1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
launchKernel();
|
|
|
|
|
timer.Reset();
|
|
|
|
|
timer.Start();
|
|
|
|
|
for (int i = 0; i < MAX_LOOP_ITER; i++) {
|
|
|
|
|
launchKernel();
|
|
|
|
|
}
|
|
|
|
|
timer.Stop();
|
|
|
|
|
} // data not too large
|
|
|
|
|
double totalSec = _dataSizeTooBig ? 1 : timer.GetElapsedTime();
|
|
|
|
|
// printf("Total Time: %f seconds\n", totalSec);
|
|
|
|
|
// printf("Average Kernel Time: %f seconds\n", totalSec / MAX_LOOP_ITER);
|
|
|
|
|
|
|
|
|
|
// analyze performance
|
|
|
|
|
avgKernelTime_ = (float)(totalSec / MAX_LOOP_ITER * 1000000); // microseconds
|
|
|
|
|
double flopCount;
|
|
|
|
|
switch (kernelTypeIdx_) {
|
|
|
|
|
case 0: // Matrix Multiply
|
|
|
|
|
flopCount = (2.0 * matrixDim1_ * matrixDim1_ * matrixDim2_);
|
|
|
|
|
// printf("FlopCount = 2*%i*%i*%i=%f\n",
|
|
|
|
|
// matrixDim1_,matrixDim1_,matrixDim2_,flopCount);
|
|
|
|
|
bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f /
|
|
|
|
|
avgKernelTime_; // GB/s
|
|
|
|
|
gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0);
|
|
|
|
|
break;
|
|
|
|
|
case 1: // Madds
|
|
|
|
|
flopCount = _reqDataSize * flopsPerByte_;
|
|
|
|
|
bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f /
|
|
|
|
|
avgKernelTime_; // GB/s
|
|
|
|
|
gflops_ = bandwidth_ * flopsPerByte_;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if (_dataSizeTooBig) {
|
|
|
|
|
printf("REQUESTED DATA SIZE EXCEEDS GLOBAL MEMORY !!!\n");
|
|
|
|
|
bandwidth_ = 0;
|
|
|
|
|
gflops_ = 0;
|
|
|
|
|
avgKernelTime_ = 0;
|
|
|
|
|
}
|
|
|
|
|
// here print out details
|
|
|
|
|
char buf[512];
|
|
|
|
|
int bytesWritten;
|
|
|
|
|
bytesWritten = SNPRINTF(buf, sizeof(buf),
|
|
|
|
|
"Kernel:%7s; "
|
|
|
|
|
"Work:%4i; "
|
|
|
|
|
"Buff:%11.0f; "
|
|
|
|
|
"Path:%7s; "
|
|
|
|
|
"%10.5e GB/s; "
|
|
|
|
|
"%10.5e GFlop/s; ",
|
|
|
|
|
kernelType[kernelTypeIdx_], static_cast<int>(workSize[workSizeIdx_]),
|
|
|
|
|
_reqDataSize, memPath[memPathIdx_], bandwidth_, gflops_);
|
|
|
|
|
testDescString = buf;
|
|
|
|
|
_perfInfo = avgKernelTime_;
|
|
|
|
|
if (!_dataSizeTooBig) checkData();
|
|
|
|
|
EXIT("run");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OCLPerfKernelThroughput::launchKernel(void) {
|
|
|
|
|
ENTER("launchKernel")
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* Copy Data To
|
|
|
|
|
**********************************************************/
|
|
|
|
|
// printf("Copying Data To Device\n");
|
|
|
|
|
switch (memPathIdx_) {
|
|
|
|
|
case 0: // zero copy
|
|
|
|
|
// do nothing
|
|
|
|
|
// void *inputPtr = _wrapper->clEnqueueMapBuffer(
|
|
|
|
|
// cmd_queue_, input1Buffer_, true, CL_MAP_READ,
|
|
|
|
|
// 0, input1BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
// void *outputPtr = _wrapper->clEnqueueMapBuffer(
|
|
|
|
|
// cmd_queue_, output1Buffer_, true, CL_MAP_READ,
|
|
|
|
|
// 0, output1BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
//_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
break;
|
|
|
|
|
case 1: // explicit copy to device memory
|
|
|
|
|
// printf("Queue: %p\n", &cmd_queue_);
|
|
|
|
|
// printf("devBuffer: %i\n", input1Buffer_);
|
|
|
|
|
// printf("hstBuffer: %p\n", input1Ptr_);
|
|
|
|
|
// printf("bufSize: %i\n", input1BufferSize_);
|
|
|
|
|
error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, input1Buffer_, true, 0, input1BufferSize_,
|
|
|
|
|
(const void*)input1Ptr_, 0, NULL, NULL);
|
|
|
|
|
if (input2BufferSize_) {
|
|
|
|
|
error_ =
|
|
|
|
|
_wrapper->clEnqueueWriteBuffer(cmd_queue_, input2Buffer_, true, 0, input2BufferSize_,
|
|
|
|
|
(const void*)input2Ptr_, 0, NULL, NULL);
|
|
|
|
|
}
|
|
|
|
|
// printf("Error: %i\n", error_);
|
|
|
|
|
std::fflush(stdout);
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
|
|
|
|
|
//_error = _wrapper->clEnqueueWriteBuffer(
|
|
|
|
|
// cmd_queue_, output1Buffer_, true, 0, output1BufferSize_,
|
|
|
|
|
// (const void *)output1Ptr_, 0, NULL, NULL );
|
|
|
|
|
// CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* Set Kernel Args
|
|
|
|
|
**********************************************************/
|
|
|
|
|
#if 0
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(
|
|
|
|
|
kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
error_ = _wrapper->clSetKernelArg(
|
|
|
|
|
kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// printf("Launching Kernel: %ix%i threads\n", global_work_size_[0],
|
|
|
|
|
// local_work_size_[0]);
|
|
|
|
|
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* Launch Kernel
|
|
|
|
|
**********************************************************/
|
|
|
|
|
error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, work_dim_, NULL,
|
|
|
|
|
(const size_t*)global_work_size_,
|
|
|
|
|
(const size_t*)local_work_size_, 0, NULL, NULL);
|
|
|
|
|
// printf("Error: %i\n", error_);
|
|
|
|
|
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
|
|
|
|
|
/***********************************************************
|
|
|
|
|
* Copy Data From
|
|
|
|
|
**********************************************************/
|
|
|
|
|
// printf("Copying Data From Device\n");
|
|
|
|
|
switch (memPathIdx_) {
|
|
|
|
|
case 0: // zero copy
|
|
|
|
|
// do nothing
|
|
|
|
|
// void *inputPtr = _wrapper->clEnqueueMapBuffer(
|
|
|
|
|
// cmd_queue_, input1Buffer_, true, CL_MAP_READ,
|
|
|
|
|
// 0, input1BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
// void *outputPtr = _wrapper->clEnqueueMapBuffer(
|
|
|
|
|
// cmd_queue_, output1Buffer_, true, CL_MAP_READ,
|
|
|
|
|
// 0, output1BufferSize_, 0, NULL, NULL, &error_);
|
|
|
|
|
//_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
break;
|
|
|
|
|
case 1: // explicit copy to device memory
|
|
|
|
|
//_error = _wrapper->clEnqueueReadBuffer(
|
|
|
|
|
// cmd_queue_, input1Buffer_, true, 0, input1BufferSize_,
|
|
|
|
|
// (void *)input1Ptr_, 0, NULL, NULL );
|
|
|
|
|
// CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
|
|
|
|
|
// printf("VAL0 %p
|
|
|
|
|
error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, output1Buffer_, true, 0,
|
|
|
|
|
output1BufferSize_, (void*)output1Ptr_, 0, NULL, NULL);
|
|
|
|
|
// printf("Error: %i\n", error_);
|
|
|
|
|
CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
EXIT("launchKernel")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Check Data
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
void OCLPerfKernelThroughput::checkData() {
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
float errorThreshhold = 0.00001f;
|
|
|
|
|
float eqMax = gold_ + errorThreshhold * gold_;
|
|
|
|
|
float eqMin = gold_ - errorThreshhold * gold_;
|
|
|
|
|
/*
|
|
|
|
|
printf("%ix%i * %ix%i = %ix%i:\n",
|
|
|
|
|
matrixDim1_, matrixDim2_,
|
|
|
|
|
matrixDim2_, matrixDim1_,
|
|
|
|
|
matrixDim1_, matrixDim1_
|
|
|
|
|
);
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) {
|
|
|
|
|
float value = output1Ptr_[i];
|
|
|
|
|
bool equal = (value > eqMin && value < eqMax);
|
|
|
|
|
if (!equal) {
|
|
|
|
|
#if 0
|
|
|
|
|
printf("Output[%i] = %.6e; gold_ = %.6e; %s\n",
|
|
|
|
|
i,
|
|
|
|
|
value,
|
|
|
|
|
gold_,
|
|
|
|
|
equal ? "Equal" : "NOT Equal");
|
|
|
|
|
#endif
|
|
|
|
|
// printf("FAILURE\n");
|
|
|
|
|
// CHECK_RESULT_NO_RETURN(1, "Data validation failed!\n");
|
|
|
|
|
_errorFlag = true;
|
|
|
|
|
break;
|
|
|
|
|
} else {
|
|
|
|
|
// printf("M[%i] = %.6e\n", i, output1Ptr_[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Close - delete all data and release opencl objects
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
unsigned int OCLPerfKernelThroughput::close(void) {
|
|
|
|
|
ENTER("close");
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
|
|
|
|
|
if (global_work_size_) {
|
|
|
|
|
delete[] global_work_size_;
|
|
|
|
|
global_work_size_ = NULL;
|
|
|
|
|
}
|
|
|
|
|
if (local_work_size_) {
|
|
|
|
|
delete[] local_work_size_;
|
|
|
|
|
local_work_size_ = NULL;
|
|
|
|
|
}
|
|
|
|
|
// switch for memory type
|
|
|
|
|
switch (memPathIdx_) {
|
|
|
|
|
case 0: // zero copy
|
|
|
|
|
// unmap ptr
|
|
|
|
|
if (input1Ptr_) {
|
|
|
|
|
error_ = /*_wrapper->*/ clEnqueueUnmapMemObject(cmd_queue_, input1Buffer_, input1Ptr_, 0,
|
|
|
|
|
NULL, NULL);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject(input_) failed");
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
error_ = _wrapper->clReleaseMemObject(input1Buffer_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed");
|
|
|
|
|
input1Buffer_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (input2Ptr_) {
|
|
|
|
|
error_ = /*_wrapper->*/ clEnqueueUnmapMemObject(cmd_queue_, input2Buffer_, input2Ptr_, 0,
|
|
|
|
|
NULL, NULL);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject(input_) failed");
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
error_ = _wrapper->clReleaseMemObject(input2Buffer_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input2Buffer_) failed");
|
|
|
|
|
input2Buffer_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (output1Ptr_) {
|
|
|
|
|
error_ = /*_wrapper->*/ clEnqueueUnmapMemObject(cmd_queue_, output1Buffer_, output1Ptr_, 0,
|
|
|
|
|
NULL, NULL);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject(output_) failed");
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
error_ = _wrapper->clReleaseMemObject(output1Buffer_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed");
|
|
|
|
|
output1Buffer_ = 0;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case 1: // explicit copy to device memory
|
|
|
|
|
// release object
|
|
|
|
|
if (input1Buffer_) {
|
|
|
|
|
error_ = _wrapper->clReleaseMemObject(input1Buffer_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed");
|
|
|
|
|
input1Buffer_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (input2Buffer_) {
|
|
|
|
|
error_ = _wrapper->clReleaseMemObject(input2Buffer_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input2Buffer_) failed");
|
|
|
|
|
input2Buffer_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (output1Buffer_) {
|
|
|
|
|
error_ = _wrapper->clReleaseMemObject(output1Buffer_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed");
|
|
|
|
|
output1Buffer_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (input1Ptr_) {
|
|
|
|
|
delete[] input1Ptr_;
|
|
|
|
|
input1Ptr_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (input2Ptr_) {
|
|
|
|
|
delete[] input2Ptr_;
|
|
|
|
|
input2Ptr_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (output1Ptr_) {
|
|
|
|
|
delete[] output1Ptr_;
|
|
|
|
|
output1Ptr_ = 0;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (kernel_) {
|
|
|
|
|
error_ = _wrapper->clReleaseKernel(kernel_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
|
|
|
|
|
kernel_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (program_) {
|
|
|
|
|
error_ = _wrapper->clReleaseProgram(program_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
|
|
|
|
|
program_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (cmd_queue_) {
|
|
|
|
|
error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseCommandQueue failed");
|
|
|
|
|
cmd_queue_ = 0;
|
|
|
|
|
}
|
|
|
|
|
if (context_) {
|
|
|
|
|
error_ = _wrapper->clReleaseContext(context_);
|
|
|
|
|
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
|
|
|
|
|
context_ = 0;
|
|
|
|
|
}
|
|
|
|
|
_wrapper->clFinish(cmd_queue_);
|
|
|
|
|
|
|
|
|
|
EXIT("close");
|
|
|
|
|
return _crcword;
|
|
|
|
|
}
|