Initial source drop of ocltst

This only adds source files for ocltst and the following test modules: oclruntime, oclperf, oclgl, ocldx. There are no build files for now.

Change-Id: I0f8d9d074c45d82e92f7d30bf22753102f272f4f


[ROCm/clr commit: 75e6add24d]
This commit is contained in:
Vlad Sytchenko
2020-05-29 12:10:04 -04:00
zatwierdzone przez Vladislav Sytchenko
rodzic a5f661537b
commit 18ce996fe2
290 zmienionych plików z 54116 dodań i 0 usunięć
+54
Wyświetl plik
@@ -0,0 +1,54 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef OCL_TEST_MODULE_H
#define OCL_TEST_MODULE_H
#include <string>
#include "OCLTest.h"
#include "OCLTestList.h"
// Record describing one test module: its name, its handle, the function
// pointers used to query and create/destroy its tests, and a test cache.
// Every member starts out empty/zero.
struct Module {
  std::string name;                 // module name (empty until assigned)
  ModuleHandle hmodule;             // handle of the module
  TestCountFuncPtr get_count;       // entry point of type TestCountFuncPtr
  TestNameFuncPtr get_name;         // entry point of type TestNameFuncPtr
  CreateTestFuncPtr create_test;    // entry point of type CreateTestFuncPtr
  DestroyTestFuncPtr destroy_test;  // entry point of type DestroyTestFuncPtr
  TestVersionFuncPtr get_version;   // entry point of type TestVersionFuncPtr
  TestLibNameFuncPtr get_libname;   // entry point of type TestLibNameFuncPtr
  OCLTest** cached_test;            // presumably one cached OCLTest* per test
                                    // index -- confirm against the loader

  // Zero-initializes the handle and all pointers; the name is left empty.
  Module()
      : name(),
        hmodule(0),
        get_count(0),
        get_name(0),
        create_test(0),
        destroy_test(0),
        get_version(0),
        get_libname(0),
        cached_test(0) {}
};
#endif // OCL_TEST_MODULE_H
@@ -0,0 +1,71 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _RESULT_STRUCT_H_
#define _RESULT_STRUCT_H_

// TestResult stores a std::string, so this header must pull it in itself.
#include <string>

// A pair of test indices delimiting a range of tests.
struct IndicesRange {
  int startIndex;
  int endIndex;
};

// Values are parenthesized so the macros expand safely inside any expression.
#define INDEX_ALL_TESTS (-1)
// Seeds for Report::max / Report::min.
#define EXTREMELY_SMALL_VALUE (-10000.0f)
#define EXTREMELY_LARGE_VALUE (10000.0f)

// Outcome of a single test: a numeric value, a pass/fail flag and a
// free-form text that starts out as a single newline.
class TestResult {
 public:
  float value;
  std::string resultString;
  bool passed;

  // Creates a passing result holding `val` and the text "\n".
  TestResult(float val) : value(val), resultString("\n"), passed(true) {}

  // Returns the object to the state produced by TestResult(val).
  void reset(float val) {
    value = val;
    passed = true;
    resultString.assign("\n");
  }
};

// Aggregated outcome of a run: an overall success flag, the number of
// failed tests, and two owned TestResult objects, `max` and `min`.
//
// Report owns `max` and `min` (allocated in the constructors, released in the
// destructor). Copying performs a deep copy so that two Report objects never
// delete the same TestResult.
class Report {
 public:
  TestResult *max;
  TestResult *min;
  bool success;
  int numFailedTests;

  // `max` starts at EXTREMELY_SMALL_VALUE and `min` at EXTREMELY_LARGE_VALUE.
  Report()
      : max(new TestResult(EXTREMELY_SMALL_VALUE)),
        min(new TestResult(EXTREMELY_LARGE_VALUE)),
        success(true),
        numFailedTests(0) {}

  // Deep copy: the new Report gets its own TestResult objects.
  // Assumes other.max / other.min are non-null, as the constructors guarantee.
  Report(const Report &other)
      : max(new TestResult(*other.max)),
        min(new TestResult(*other.min)),
        success(other.success),
        numFailedTests(other.numFailedTests) {}

  // Copies the contents of `other` into the TestResult objects already owned
  // by this Report; no pointer is shared. Self-assignment is a no-op.
  Report &operator=(const Report &other) {
    if (this != &other) {
      *max = *other.max;
      *min = *other.min;
      success = other.success;
      numFailedTests = other.numFailedTests;
    }
    return *this;
  }

  // Restores the state produced by the default constructor, reusing the
  // existing TestResult objects.
  void reset() {
    max->reset(EXTREMELY_SMALL_VALUE);
    min->reset(EXTREMELY_LARGE_VALUE);
    success = true;
    numFailedTests = 0;
  }

  ~Report() {
    delete max;
    delete min;
  }
};

#endif  // _RESULT_STRUCT_H_
+111
Wyświetl plik
@@ -0,0 +1,111 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "Timer.h"
#ifdef ATI_OS_WIN
#include <windows.h>
#endif
#ifdef ATI_OS_LINUX
#include <sys/time.h>
#endif
// Builds a stopped counter with zero accumulated ticks and records the tick
// rate that GetElapsedTime() divides by.
CPerfCounter::CPerfCounter() : _clocks(0), _start(0) {
#if defined(ATI_OS_WIN)
  // Tick rate of the Win32 performance counter.
  QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&_freq));
#endif
#if defined(ATI_OS_LINUX)
  // Start()/Stop() sample the clock in milliseconds: 1000 ticks per second.
  _freq = 1000;
#endif
}
// Nothing to release: the counter only holds plain integer state.
CPerfCounter::~CPerfCounter() {}
// Records the current time as the start of a measurement interval.
// On Windows, starting a counter that is already running shows an error
// dialog and terminates the process.
void CPerfCounter::Start(void) {
#if defined(ATI_OS_WIN)
  if (_start != 0) {
    MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK);
    exit(0);
  }
  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&_start));
#endif
#if defined(ATI_OS_LINUX)
  struct timeval now;
  gettimeofday(&now, 0);
  // Convert the (seconds, microseconds) pair to whole milliseconds.
  const i64 secondsAsMs = static_cast<i64>(now.tv_sec) * 1000;
  const i64 microsAsMs = static_cast<i64>(now.tv_usec) / 1000;
  _start = secondsAsMs + microsAsMs;
#endif
}
void CPerfCounter::Stop(void) {
i64 n;
#ifdef ATI_OS_WIN
if (!_start) {
MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK);
exit(0);
}
QueryPerformanceCounter((LARGE_INTEGER *)&n);
#endif
#ifdef ATI_OS_LINUX
struct timeval s;
gettimeofday(&s, 0);
n = (i64)s.tv_sec * 1000 + (i64)s.tv_usec / 1000;
#endif
n -= _start;
_start = 0;
_clocks += n;
}
// Discards the accumulated tick count. On Windows, resetting a counter that
// is still running shows an error dialog and terminates the process.
void CPerfCounter::Reset(void) {
#if defined(ATI_OS_WIN)
  const bool stillRunning = (_start != 0);
  if (stillRunning) {
    MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK);
    exit(0);
  }
#endif
  _clocks = 0;
}
// Returns the accumulated time in seconds (accumulated ticks divided by the
// tick rate). On Windows, querying a counter that is still running shows an
// error dialog and terminates the process.
double CPerfCounter::GetElapsedTime(void) {
#if defined(ATI_OS_WIN)
  const bool stillRunning = (_start != 0);
  if (stillRunning) {
    MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK);
    exit(0);
  }
#endif
  const double ticks = static_cast<double>(_clocks);
  const double ticksPerSecond = static_cast<double>(_freq);
  return ticks / ticksPerSecond;
}
+46
Wyświetl plik
@@ -0,0 +1,46 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _TIMER_H_
#define _TIMER_H_
// 64-bit signed integer used by CPerfCounter for tick counts.
// Windows builds keep the compiler-specific __int64; every other build
// (ATI_OS_LINUX or no platform macro at all) uses long long, so the header
// no longer fails to compile when neither platform macro is defined.
#if defined(ATI_OS_WIN)
typedef __int64 i64;
#else
typedef long long i64;
#endif
// Accumulating stopwatch. Each Start()/Stop() pair adds the length of the
// interval to a running total, which GetElapsedTime() reports in seconds and
// Reset() clears.
class CPerfCounter {
 public:
  CPerfCounter();
  ~CPerfCounter();

  // Marks the beginning of a measurement interval.
  void Start();
  // Ends the interval and adds it to the running total.
  void Stop();
  // Clears the running total.
  void Reset();
  // Running total converted to seconds.
  double GetElapsedTime();

 private:
  i64 _freq;    // ticks per second
  i64 _clocks;  // accumulated ticks over all completed intervals
  i64 _start;   // tick value at Start(); zero while the counter is stopped
};
#endif // _TIMER_H_
+180
Wyświetl plik
@@ -0,0 +1,180 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef OCL_TEST_WORKER_H
#define OCL_TEST_WORKER_H
/////////////////////////////////////////////////////////////////////////////
#include <assert.h>
#include <stdio.h>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "Module.h"
#include "OCLTest.h"
#include "OCLTestList.h"
#include "ResultStruct.h"
#include "Timer.h"
#include "getopt.h"
#include "pfm.h"
/////////////////////////////////////////////////////////////////////////////
// Routine executed on behalf of a Worker: receives one opaque argument and
// returns an opaque pointer.
typedef void* (*TestMethod)(void*);
/////////////////////////////////////////////////////////////////////////////
class Worker {
public:
Worker()
: m_wrapper(0),
m_module(0),
m_run(0),
m_id(0),
m_subtest(0),
m_testindex(0),
m_dump(false),
m_display(false),
m_useCPU(false),
m_window(0),
m_width(0),
m_height(0),
m_buffer(0),
m_perflab(false),
m_deviceId(0),
m_platform(0) {
// EMPTY!
}
Worker(OCLWrapper* wrapper, Module* module, TestMethod run, unsigned int id,
unsigned int subtest, unsigned int testindex, bool dump, bool view,
bool useCPU, void* window, unsigned int x, unsigned int y,
bool perflab, unsigned int deviceId = 0, unsigned int platform = 0)
: m_wrapper(wrapper),
m_module(module),
m_run(run),
m_id(id),
m_subtest(subtest),
m_testindex(testindex),
m_dump(dump),
m_display(view),
m_useCPU(useCPU),
m_window(window),
m_width(x),
m_height(y),
m_buffer(0),
m_perflab(perflab),
m_deviceId(deviceId),
m_platform(platform) {
if (m_dump == true || m_display == true) {
m_buffer = new float[4 * m_width * m_height];
if (m_buffer != 0) {
memset(m_buffer, 0, 4 * m_width * m_height * sizeof(float));
} else {
m_dump = false;
m_display = false;
}
}
m_result = new TestResult(0.0f);
}
Worker(const Worker& w) {
if (this == &w) return;
if (m_buffer) delete[] m_buffer;
m_buffer = 0;
m_wrapper = w.m_wrapper;
m_module = w.m_module;
m_run = w.m_run;
m_id = w.m_id;
m_subtest = w.m_subtest;
m_testindex = w.m_testindex;
m_dump = w.m_dump;
m_display = w.m_display;
m_useCPU = w.m_useCPU;
m_window = w.m_window;
m_width = w.m_width;
m_height = w.m_height;
m_perflab = w.m_perflab;
m_deviceId = w.m_deviceId;
m_result = w.m_result;
m_platform = w.m_platform;
if (w.m_buffer) {
m_buffer = new float[4 * m_width * m_height];
if (m_buffer != 0) {
memcpy(m_buffer, w.m_buffer, 4 * m_width * m_height * sizeof(float));
}
}
}
~Worker() {
if (m_buffer) delete[] m_buffer;
m_buffer = 0;
delete m_result;
m_result = 0;
}
OCLWrapper* getOCLWrapper() { return m_wrapper; }
Module* getModule() { return m_module; }
TestMethod getTestMethod() { return m_run; }
unsigned int getId() { return m_id; }
unsigned int getSubTest() { return m_subtest; }
unsigned int getTestIndex() { return m_testindex; }
bool isDumpEnabled() { return m_dump; }
bool isDisplayEnabled() { return m_display; }
bool isCPUEnabled() { return m_useCPU; }
void* getWindow() { return m_window; }
unsigned int getWidth() { return m_width; }
unsigned int getHeight() { return m_height; }
float* getBuffer() { return m_buffer; }
bool getPerflab() { return m_perflab; }
unsigned int getDeviceId() { return m_deviceId; }
TestResult* getResult() { return m_result; }
unsigned int getPlatformID() { return m_platform; }
private:
OCLWrapper* m_wrapper;
Module* m_module;
TestMethod m_run;
unsigned int m_id;
unsigned int m_subtest;
unsigned int m_testindex;
bool m_dump;
bool m_display;
bool m_useCPU;
void* m_window;
unsigned int m_width;
unsigned int m_height;
float* m_buffer;
bool m_perflab;
unsigned int m_deviceId;
unsigned int m_platform;
TestResult* m_result;
};
/////////////////////////////////////////////////////////////////////////////
#endif // OCL_TEST_WORKER_H
+162
Wyświetl plik
@@ -0,0 +1,162 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "oclsysinfo.h"
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <cstdio>
#include <string>
#include <vector>
// Default for MAX_DEVICES, used only when the build has not defined it.
// NOTE(review): oclSysInfo() in this file does not reference this macro --
// confirm whether it is still needed.
#ifndef MAX_DEVICES
#define MAX_DEVICES 16
#endif // MAX_DEVICES
int oclSysInfo(std::string &info_string, bool use_cpu, unsigned dev_id,
unsigned int platformIndex) {
/*
* Have a look at the available platforms and pick the one
* in the platforms vector in index "platformIndex".
*/
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_uint num_devices = 0;
cl_device_id *devices = NULL;
cl_device_id device = NULL;
int error = clGetPlatformIDs(0, NULL, &numPlatforms);
if (CL_SUCCESS != error) {
fprintf(stderr, "clGetPlatformIDs() failed");
return 0;
}
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error = clGetPlatformIDs(numPlatforms, platforms, NULL);
if (CL_SUCCESS != error) {
fprintf(stderr, "clGetPlatformIDs() failed");
return 0;
}
#if 0
for (unsigned i = 0; i < numPlatforms; ++i) {
/* Get the number of requested devices */
error = clGetDeviceIDs(platforms[i], (use_cpu) ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices );
#if 0
/* clGetDeviceIDs fails when no GPU devices are present */
if (error) {
fprintf(stderr, "clGetDeviceIDs failed: %d\n", error );
return 0;
}
#endif
#if 0
char pbuf[100];
error = clGetPlatformInfo(
platforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuf),
pbuf,
NULL);
if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
platform = platforms[i];
break;
}
#else
/* Select platform with GPU devices present */
if (num_devices > 0) {
platform = platforms[i];
break;
}
#endif
}
#endif
error = clGetDeviceIDs(platforms[platformIndex],
(use_cpu) ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU,
0, NULL, &num_devices);
if (error) {
fprintf(stderr, "clGetDeviceIDs failed: %d\n", error);
return 0;
}
platform = platforms[platformIndex];
delete[] platforms;
}
if (dev_id >= num_devices) {
fprintf(stderr, "Device selected does not exist.\n");
return 0;
}
if (NULL == platform) {
fprintf(stderr,
"Couldn't find platform with GPU devices, cannot proceed.\n");
return 0;
}
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
if (!devices) {
fprintf(stderr, "no devices\n");
return 0;
}
/* Get the requested device */
error = clGetDeviceIDs(platform,
(use_cpu) ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU,
num_devices, devices, NULL);
if (error) {
fprintf(stderr, "clGetDeviceIDs failed: %d\n", error);
return 0;
}
device = devices[dev_id];
char c[1024];
char tmpString[256];
static const char *no_yes[] = {"NO", "YES"};
sprintf(tmpString, "\nCompute Device info:\n");
info_string.append(tmpString);
clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(c), &c, NULL);
sprintf(tmpString, "\tPlatform Version: %s\n", c);
info_string.append(tmpString);
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), &c, NULL);
sprintf(tmpString, "\tDevice Name: %s\n", c);
info_string.append(tmpString);
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), &c, NULL);
sprintf(tmpString, "\tVendor: %s\n", c);
info_string.append(tmpString);
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), &c, NULL);
sprintf(tmpString, "\tDevice Version: %s\n", c);
info_string.append(tmpString);
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), &c, NULL);
sprintf(tmpString, "\tDriver Version: %s\n", c);
info_string.append(tmpString);
clGetDeviceInfo(device, CL_DEVICE_BOARD_NAME_AMD, sizeof(c), &c, NULL);
sprintf(tmpString, "\tBoard Name: %s\n", c);
info_string.append(tmpString);
#if defined(ATI_OS_LINUX)
cl_device_topology_amd topology;
clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(topology), &topology,
NULL);
if (topology.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) {
sprintf(tmpString, "\tDevice Topology: PCI[ B#%d, D#%d, F#%d]\n",
topology.pcie.bus, topology.pcie.device, topology.pcie.function);
info_string.append(tmpString);
}
#endif
free(devices);
return 1;
}
+28
Wyświetl plik
@@ -0,0 +1,28 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLSYSINFO_H_
#define _OCLSYSINFO_H_
#include <string>
// Appends a textual description of one OpenCL device (platform version,
// device name, vendor, device/driver version, board name) to info_string.
// @param info_string   String the description is appended to
// @param useCPU        true to look at CPU devices, false for GPU devices
// @param dev_id        Index of the device among the devices of that type
// @param platformIndex Index of the OpenCL platform to query (default 0)
// @return 1 on success, 0 on failure
int oclSysInfo(std::string& info_string, bool useCPU, unsigned dev_id,
unsigned int platformIndex = 0);
#endif //_OCLSYSINFO_H_
Plik diff jest za duży Load Diff
+79
Wyświetl plik
@@ -0,0 +1,79 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "pfm.h"
#ifdef ATI_OS_WIN
#include <io.h>
#endif
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
// Writes a float image to `filename` as a three-channel "PF" file.
//
// @param filename   Path of the file to create (opened with "wb").
// @param buffer     Source pixels, `components` floats per pixel, `width`
//                   pixels per row, `height` rows.
// @param width      Image width in pixels (any width is accepted).
// @param height     Image height in pixels.
// @param components Floats per source pixel. Channel 0 is always written;
//                   channels 1 and 2 are written when present, otherwise
//                   channel 0 is repeated in their place.
// @return 0 on success, 1 if the file cannot be opened or a write fails.
//
// Rows are emitted from the last row of `buffer` to the first. The header
// announces a scale of -1; the floats are written in host byte order.
unsigned int SavePFM(const char* filename, const float* buffer,
                     unsigned int width, unsigned int height,
                     unsigned int components) {
  FILE* fh = fopen(filename, "wb");
  if (fh == NULL) {
    return 1;
  }

  unsigned int error = 0;

  // Header lines end in a bare LF; the file is in binary mode, so "\n" is
  // written as the single byte 0x0a. %u matches the unsigned arguments.
  if (fprintf(fh, "PF\n%u %u\n-1\n", width, height) < 0) {
    error = 1;
  }

  // Rows are converted through a fixed-size staging buffer in chunks, so
  // images wider than the buffer no longer overflow the stack.
  enum { kChunkPixels = 4096 };
  float chunk[3 * kChunkPixels];

  // size_t arithmetic keeps row offsets from wrapping for large images.
  const size_t rowStride = static_cast<size_t>(components) * width;

  for (unsigned int y = height; y > 0 && error == 0; y--) {
    const float* row = buffer + rowStride * (y - 1);
    for (size_t first = 0; first < width; first += kChunkPixels) {
      size_t count = width - first;
      if (count > kChunkPixels) {
        count = kChunkPixels;
      }
      for (size_t i = 0; i < count; i++) {
        const float* px = row + (first + i) * components;
        chunk[i * 3 + 0] = px[0];
        chunk[i * 3 + 1] = (components > 1) ? px[1] : px[0];
        chunk[i * 3 + 2] = (components > 2) ? px[2] : px[0];
      }
      if (fwrite(chunk, sizeof(float), count * 3, fh) != count * 3) {
        error = 1;
        break;
      }
    }
  }

  // fclose() flushes buffered data; a failure here means data was lost.
  if (fclose(fh) != 0) {
    error = 1;
  }
  return error;
}
+28
Wyświetl plik
@@ -0,0 +1,28 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _PFM_H_
#define _PFM_H_
// Saves a float image as a three-channel "PF" file.
// @param filename   Path of the file to write
// @param buffer     Source pixels, `components` floats per pixel
// @param width      Image width in pixels
// @param height     Image height in pixels
// @param components Number of floats per source pixel
// @return 0 on success, 1 on failure
extern unsigned int SavePFM(const char* filename, const float* buffer,
unsigned int width, unsigned int height,
unsigned int components);
#endif // _PFM_H_
@@ -0,0 +1,148 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef OCL_THREAD_H
#define OCL_THREAD_H
//!
//! \file Thread.h
//!
#ifdef ATI_OS_WIN
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0501
#endif
#include "windows.h"
#else
#include "pthread.h"
#endif
//! Signature of a thread entry point: it receives one opaque argument and
//! returns an opaque pointer. This is the type accepted by
//! OCLutil::Thread::create() on every platform.
typedef void *(*oclThreadFunc)(void *);
namespace OCLutil {
//! \class Lock
//! \brief Provides a wrapper for locking primitives used to
//! synchronize _CPU_ threads.
//!
//! Common usage would be:
//!
//! OCLutil::Lock lock;
//!
//! ....
//!
//! // Critical section begins
//!
//! lock.lock();
//!
//! .....
//!
//! // Critical section ends
//!
//! lock.unlock();
//!
class Lock {
public:
//! Constructor for Lock
Lock();
//! Destructor for Lock
~Lock();
//! Acquire the lock; if it is not available, wait until it is
void lock();
//! Try to acquire the lock without waiting: if it is available, hold it,
//! otherwise return immediately so the caller can do something else.
//! NOTE(review): presumably returns true when the lock was acquired --
//! confirm against the implementation.
bool tryLock();
//! Release the lock and return
void unlock();
private:
/////////////////////////////////////////////////////////////
//!
//! Private data members and methods
//!
//! System specific synchronization primitive:
//! a CRITICAL_SECTION on Windows, a pthread mutex elsewhere
#ifdef ATI_OS_WIN
CRITICAL_SECTION _cs;
#else
pthread_mutex_t _lock;
#endif
};
//////////////////////////////////////////////////////////////
//!
//! \class Thread
//! \brief Provides a wrapper for creating a _CPU_ thread.
//!
//! This class provides a simple wrapper around a CPU thread.
//! The class name might be a bit confusing, especially considering
//! that the GPU has its own threads as well.
//!
class Thread {
public:
//! Thread constructor and destructor. Note that the thread is
//! NOT created in the constructor. The thread creation takes
//! place in the create method
Thread();
~Thread();
//! Wrapper for pthread_create. Pass the thread's entry
//! point and the data to be passed to that routine.
//! NOTE(review): presumably returns true on success -- confirm.
bool create(oclThreadFunc func, void *arg);
//! Wrapper for pthread_join. The calling thread
//! will wait until _this_ thread exits.
//! NOTE(review): presumably returns true on success -- confirm.
bool join();
//! Get the thread data passed by the application
void *getData() { return _data; }
//! Get the thread ID
static unsigned int getID();
private:
/////////////////////////////////////////////////////////////
//!
//! Private data members and methods
//!
#ifdef ATI_OS_WIN
//! Windows: handle of the thread and its numeric ID
HANDLE _tid;
unsigned int _ID;
#else
//! Other platforms: pthread identifier and creation attributes
pthread_t _tid;
pthread_attr_t _attr;
#endif
//! Value returned by getData()
void *_data;
};
}; // namespace OCLutil
#endif
@@ -0,0 +1,47 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef OCLLOG_H_
#define OCLLOG_H_
// DLLIMPORT decorates the logging functions declared below.
// On Windows it expands to dllexport when OCLTST_LOG_BUILD is defined and to
// dllimport otherwise; on other platforms it expands to nothing.
#ifdef ATI_OS_WIN
#ifdef OCLTST_LOG_BUILD
#define DLLIMPORT __declspec(dllexport)
#else
#define DLLIMPORT __declspec(dllimport)
#endif // OCLTST_LOG_BUILD
#else
#define DLLIMPORT
#endif // ATI_OS_WIN
// Logging levels accepted by oclTestLog().
enum oclLoggingLevel {
  OCLTEST_LOG_ALWAYS = 0,
  OCLTEST_LOG_VERBOSE = 1
};
// Emits a log message at the given level.
// NOTE(review): `fmt` plus the variadic arguments look like a printf-style
// format -- confirm against the implementation.
extern DLLIMPORT void oclTestLog(oclLoggingLevel logLevel, const char* fmt,
...);
// Sets the current log level.
// NOTE(review): presumably takes an oclLoggingLevel value -- confirm.
extern DLLIMPORT void oclTestSetLogLevel(int level);
// Enables logging to the named file.
extern DLLIMPORT void oclTestEnableLogToFile(const char* filename);
#endif // OCLLOG_H_
@@ -0,0 +1,73 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLTEST_H_
#define _OCLTEST_H_
#include <string>
#include "OCLWrapper.h"
class BaseTestImp;
class OCLTestImp;
// Abstract interface of a single test. Apart from cache_test(),
// resetDescString() and the destructor, every member function is pure
// virtual and must be supplied by the concrete test class.
class OCLTest {
public:
virtual unsigned int getThreadUsage(void) = 0;
virtual int getNumSubTests(void) = 0;
// Overloads of open(); which one is called depends on the caller.
virtual void open() = 0;
virtual void open(unsigned int test, const char* deviceName,
unsigned int architecture) = 0;
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId, unsigned int platformIndex) = 0;
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId) = 0;
virtual void run(void) = 0;
virtual unsigned int close(void) = 0;
// Error state accessors.
virtual void setErrorMsg(const char* error) = 0;
virtual const char* getErrorMsg(void) = 0;
virtual bool hasErrorOccured(void) = 0;
virtual void clearError() = 0;
virtual void setDeviceId(unsigned int deviceId) = 0;
virtual void setPlatformIndex(unsigned int platformIndex) = 0;
// Conversions to the implementation base classes.
virtual OCLTestImp* toOCLTestImp() = 0;
virtual BaseTestImp* toBaseTestImp() = 0;
virtual float getPerfInfo() = 0;
virtual void clearPerfInfo(void) = 0;
virtual void setIterationCount(int cnt) = 0;
virtual void useCPU() = 0;
// Having this return true will allow the creation of the
// test to be cached in between runs and will only be
// deleted after all the tests are finished running.
// NOTE(review): an earlier version of this comment said the default is
// false because few tests support caching, but the default implementation
// below returns true -- confirm which behavior is intended.
// FIXME: Switch all tests to support caching.
virtual bool cache_test() { return true; }
// Free-form description of the test; cleared by resetDescString().
std::string testDescString;
void resetDescString(void) { testDescString.clear(); }
virtual ~OCLTest(){};
};
#endif // _OCLTEST_H_
@@ -0,0 +1,43 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLMODULE_H_
#define _OCLMODULE_H_
// Calling convention of the functions exported by a test module.
// Windows builds use __cdecl; every other build (ATI_OS_LINUX or no platform
// macro at all) uses the platform default, so the typedefs below compile even
// when neither platform macro is defined.
#if defined(ATI_OS_WIN)
#define OCLLCONV __cdecl
#else
#define OCLLCONV
#endif

class OCLTest;

//
// exported function pointer typedefs
//
// Takes no arguments, returns an unsigned count.
typedef unsigned int(OCLLCONV *TestCountFuncPtr)(void);
// Maps an unsigned index to a C string.
typedef const char *(OCLLCONV *TestNameFuncPtr)(unsigned int);
// Maps an unsigned index to an OCLTest pointer.
typedef OCLTest *(OCLLCONV *CreateTestFuncPtr)(unsigned int);
// Takes an OCLTest pointer, returns nothing.
typedef void(OCLLCONV *DestroyTestFuncPtr)(OCLTest *);
// Takes no arguments, returns an unsigned version value.
typedef unsigned int(OCLLCONV *TestVersionFuncPtr)(void);
// Takes no arguments, returns a C string.
typedef const char *(OCLLCONV *TestLibNameFuncPtr)(void);
#endif // _OCLMODULE_H_
@@ -0,0 +1,31 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef OCLTESTUTILS_H_
#define OCLTESTUTILS_H_
#include <string>
// @brief Load file to a string
// @param FN Name of the file to be loaded
// @param S String to store the loaded file
// @return true on success
// NOTE(review): the state of S after a failed load is not specified by this
// declaration -- confirm against the implementation before relying on it.
bool loadFile(const char* FN, std::string& S);
#endif /* OCLTESTUTILS_H_ */
@@ -0,0 +1,614 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef __OCLWrapper_H
#define __OCLWrapper_H
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include "CL/cl.h"
#include "CL/cl_ext.h"
#include "CL/cl_gl.h"
#include "cl_profile_amd.h"
// Function pointer types for entry points that OCLWrapper stores as private
// *_ptr members. Each typedef is named <entry point>_fn.
// Function pointer type for clUnloadPlatformAMD.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clUnloadPlatformAMD_fn)(
cl_platform_id id);
// Function Pointer Declarations for cl_khr_gl_sharing extension (missing in
// cl_gl.h)
typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties *properties, cl_gl_context_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret);
typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLBuffer_fn)(
cl_context context, cl_mem_flags flags, unsigned int bufobj,
int *errcode_ret);
// Note: clCreateFromGLTexture_fn and clCreateFromGLTexture2D_fn below are
// declared with identical signatures.
typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture_fn)(
cl_context context, cl_mem_flags flags, unsigned int texture_target,
int miplevel, unsigned int texture, cl_int *errcode_ret);
typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLTexture2D_fn)(
cl_context context, cl_mem_flags flags, unsigned int texture_target,
int miplevel, unsigned int texture, cl_int *errcode_ret);
typedef CL_API_ENTRY cl_mem(CL_API_CALL *clCreateFromGLRenderbuffer_fn)(
cl_context context, cl_mem_flags flags, unsigned int renderbuffer,
cl_int *errcode_ret);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLObjectInfo_fn)(
cl_mem memobj, cl_gl_object_type *gl_object_type,
unsigned int *gl_object_name);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetGLTextureInfo_fn)(
cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueAcquireGLObjects_fn)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueReleaseGLObjects_fn)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
// Function Pointer Declarations for performance counters
typedef CL_API_ENTRY cl_perfcounter_amd(CL_API_CALL *clCreatePerfCounterAMD_fn)(
cl_device_id device, cl_perfcounter_property *properties,
cl_int *errcode_ret);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueBeginPerfCounterAMD_fn)(
cl_command_queue command_queue, cl_uint num_perf_counters,
cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clEnqueueEndPerfCounterAMD_fn)(
cl_command_queue command_queue, cl_uint num_perf_counters,
cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clGetPerfCounterInfoAMD_fn)(
cl_perfcounter_amd perf_counter, cl_perfcounter_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clReleasePerfCounterAMD_fn)(
cl_perfcounter_amd perf_counter);
typedef CL_API_ENTRY cl_int(CL_API_CALL *clRetainPerfCounterAMD_fn)(
cl_perfcounter_amd perf_counter);
// Function pointer type for clSetDeviceClockModeAMD.
typedef CL_API_ENTRY cl_int(CL_API_CALL *clSetDeviceClockModeAMD_fn)(
cl_device_id device,
cl_set_device_clock_mode_input_amd set_clock_mode_input,
cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
// Declares member functions named after OpenCL entry points (core API,
// cl_khr_gl_sharing, OpenCL 2.0 and AMD extensions), each with the same
// parameter list as the entry point it is named after. The private section
// holds one function pointer per extension entry point, typed with the *_fn
// typedefs. Only declarations are visible here; the member definitions and the
// code that fills the pointers live outside this header.
class OCLWrapper {
public:
// Defined out of line.
// NOTE(review): presumably this is where the private *_ptr members are
// resolved -- confirm in the implementation file.
OCLWrapper();
// Empty destructor: nothing is released here.
~OCLWrapper() {}
// All OCL APIs are declared in the order they appear in cl.h
cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
cl_uint *num_platforms);
cl_int clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices);
cl_int clGetDeviceInfo(cl_device_id device, cl_device_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_context clCreateContext(cl_context_properties *properties,
cl_uint num_devices, const cl_device_id *devices,
void(CL_CALLBACK *pfn_notify)(const char *,
const void *, size_t,
void *),
void *user_data, cl_int *errcode_ret);
cl_context clCreateContextFromType(
cl_context_properties *properties, cl_device_type device_type,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret);
cl_int clRetainContext(cl_context context);
cl_int clReleaseContext(cl_context context);
cl_int clGetContextInfo(cl_context context, cl_context_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device,
cl_command_queue_properties properties,
cl_int *errcode_ret);
cl_int clRetainCommandQueue(cl_command_queue command_queue);
cl_int clReleaseCommandQueue(cl_command_queue command_queue);
cl_int clGetCommandQueueInfo(cl_command_queue command_queue,
cl_command_queue_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
void *host_ptr, cl_int *errcode_ret);
cl_mem clCreateImage2D(cl_context context, cl_mem_flags flags,
const cl_image_format *image_format,
size_t image_width, size_t image_height,
size_t image_row_pitch, void *host_ptr,
cl_int *errcode_ret);
cl_mem clCreateImage3D(cl_context context, cl_mem_flags flags,
const cl_image_format *image_format,
size_t image_width, size_t image_height,
size_t image_depth, size_t image_row_pitch,
size_t image_slice_pitch, void *host_ptr,
cl_int *errcode_ret);
cl_int clRetainMemObject(cl_mem memobj);
cl_int clReleaseMemObject(cl_mem memobj);
cl_int clGetSupportedImageFormats(cl_context context, cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_image_format *image_formats,
cl_uint *num_image_formats);
cl_int clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clGetImageInfo(cl_mem image, cl_image_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_sampler clCreateSampler(cl_context context, cl_bool normalized_coords,
cl_addressing_mode addressing_mode,
cl_filter_mode filter_mode, cl_int *errcode_ret);
cl_int clRetainSampler(cl_sampler sampler);
cl_int clReleaseSampler(cl_sampler sampler);
cl_int clGetSamplerInfo(cl_sampler sampler, cl_sampler_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_program clCreateProgramWithSource(cl_context context, cl_uint count,
const char **strings,
const size_t *lengths,
cl_int *errcode_ret);
cl_program clCreateProgramWithBinary(cl_context context, cl_uint num_devices,
const cl_device_id *device_list,
const size_t *lengths,
const unsigned char **binaries,
cl_int *binary_status,
cl_int *errcode_ret);
cl_int clRetainProgram(cl_program program);
cl_int clReleaseProgram(cl_program program);
cl_int clBuildProgram(cl_program program, cl_uint num_devices,
const cl_device_id *device_list, const char *options,
void(CL_CALLBACK *pfn_notify)(cl_program program,
void *user_data),
void *user_data);
cl_int clCompileProgram(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_headers,
const cl_program *input_headers, const char **header_include_names,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data);
cl_program clLinkProgram(cl_context context, cl_uint num_devices,
const cl_device_id *device_list, const char *options,
cl_uint num_input_programs,
const cl_program *input_programs,
void(CL_CALLBACK *pfn_notify)(cl_program program,
void *user_data),
void *user_data, cl_int *errcode_ret);
cl_int clUnloadCompiler(void);
cl_int clUnloadPlatform(cl_platform_id);
cl_int clGetProgramInfo(cl_program program, cl_program_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
cl_program_build_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_kernel clCreateKernel(cl_program program, const char *kernel_name,
cl_int *errcode_ret);
cl_int clCreateKernelsInProgram(cl_program program, cl_uint num_kernels,
cl_kernel *kernels, cl_uint *num_kernels_ret);
cl_int clRetainKernel(cl_kernel kernel);
cl_int clReleaseKernel(cl_kernel kernel);
cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size,
const void *arg_value);
cl_int clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device,
cl_kernel_work_group_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list);
cl_int clGetEventInfo(cl_event evnt, cl_event_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clRetainEvent(cl_event evnt);
cl_int clReleaseEvent(cl_event evnt);
cl_int clGetEventProfilingInfo(cl_event evnt, cl_profiling_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clFlush(cl_command_queue command_queue);
cl_int clFinish(cl_command_queue command_queue);
cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
cl_bool blocking_read, size_t offset, size_t cb,
void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
cl_bool blocking_write, size_t offset, size_t cb,
const void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer,
cl_mem dst_buffer, size_t src_offset,
size_t dst_offset, size_t cb,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueReadImage(cl_command_queue command_queue, cl_mem image,
cl_bool blocking_read, const size_t *origin,
const size_t *region, size_t row_pitch,
size_t slice_pitch, void *ptr,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image,
cl_bool blocking_write, const size_t *origin,
const size_t *region, size_t input_row_pitch,
size_t input_slice_pitch, const void *ptr,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image,
cl_mem dst_image, const size_t *src_origin,
const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
cl_mem src_image, cl_mem dst_buffer,
const size_t *src_origin,
const size_t *region, size_t dst_offset,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *evnt);
cl_int clEnqueueCopyBufferToImage(cl_command_queue command_queue,
cl_mem src_buffer, cl_mem dst_image,
size_t src_offset, const size_t *dst_origin,
const size_t *region,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *evnt);
void *clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer,
cl_bool blocking_map, cl_map_flags map_flags,
size_t offset, size_t cb,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt,
cl_int *errcode_ret);
void *clEnqueueMapImage(cl_command_queue command_queue, cl_mem image,
cl_bool blocking_map, cl_map_flags map_flags,
const size_t *origin, const size_t *region,
size_t *image_row_pitch, size_t *image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt,
cl_int *errcode_ret);
cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_mem memobj,
void *mapped_ptr,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *evnt);
cl_int clEnqueueNDRangeKernel(
cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueNativeKernel(cl_command_queue command_queue,
void(CL_CALLBACK *user_func)(void *), void *args,
size_t cb_args, cl_uint num_mem_objects,
const cl_mem *mem_list,
const void **args_mem_loc,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *evnt);
cl_int clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *evnt);
cl_int clEnqueueWaitForEvents(cl_command_queue command_queue,
cl_uint num_events, const cl_event *event_list);
cl_int clEnqueueBarrier(cl_command_queue command_queue);
void *clGetExtensionFunctionAddress(const char *func_name);
cl_int clEnqueueReadBufferRect(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *evnt);
cl_int clEnqueueWriteBufferRect(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *evnt);
cl_int clEnqueueCopyBufferRect(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
cl_mem clCreateImage(cl_context context, cl_mem_flags flags,
const cl_image_format *image_format,
const cl_image_desc *image_desc, void *host_ptr,
cl_int *errcode_ret);
cl_mem clCreateSubBuffer(cl_mem mem, cl_mem_flags flags,
cl_buffer_create_type buffer_create_type,
const void *buffer_create_info, cl_int *errcode_ret);
cl_int clSetEventCallback(
cl_event event, cl_int command_exec_callback_type,
void(CL_CALLBACK *pfn_event_notify)(cl_event event,
cl_int event_command_exec_status,
void *user_data),
void *user_data);
cl_int clEnqueueFillImage(cl_command_queue command_queue, cl_mem image,
void *ptr, const size_t *origin,
const size_t *region,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *evnt);
// AMD extension entry points; each has a matching private *_ptr member below.
cl_int clUnloadPlatformAMD(cl_platform_id id);
cl_int clEnqueueWaitSignalAMD(cl_command_queue command_queue,
cl_mem mem_object, cl_uint value,
cl_uint num_events,
const cl_event *event_wait_list,
cl_event *event);
cl_int clEnqueueWriteSignalAMD(cl_command_queue command_queue,
cl_mem mem_object, cl_uint value,
cl_ulong offset, cl_uint num_events,
const cl_event *event_list, cl_event *event);
cl_int clEnqueueMakeBuffersResidentAMD(
cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects,
cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses,
cl_uint num_events, const cl_event *event_list, cl_event *event);
cl_int clEnqueueMigrateMemObjects(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem *mem_objects,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
// CL-GL Extension: cl_khr_gl_sharing
cl_int clGetGLContextInfoKHR(const cl_context_properties *properties,
cl_gl_context_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_mem clCreateFromGLBuffer(cl_context context, cl_mem_flags flags,
unsigned int bufobj, int *errcode_ret);
cl_mem clCreateFromGLTexture(cl_context context, cl_mem_flags flags,
unsigned int texture_target, int miplevel,
unsigned int texture, cl_int *errcode_ret);
cl_mem clCreateFromGLTexture2D(cl_context context, cl_mem_flags flags,
unsigned int texture_target, int miplevel,
unsigned int texture, cl_int *errcode_ret);
cl_mem clCreateFromGLRenderbuffer(cl_context context, cl_mem_flags flags,
unsigned int renderbuffer,
cl_int *errcode_ret);
cl_int clGetGLObjectInfo(cl_mem memobj, cl_gl_object_type *gl_object_type,
unsigned int *gl_object_name);
cl_int clGetGLTextureInfo(cl_mem memobj, cl_gl_texture_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem *mem_objects,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
cl_int clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem *mem_objects,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
// The declarations in this group exist only when the OpenCL headers define
// CL_VERSION_2_0.
#if defined(CL_VERSION_2_0)
cl_command_queue clCreateCommandQueueWithProperties(
cl_context context, cl_device_id device,
const cl_queue_properties *properties, cl_int *errcode_ret);
void *clSVMAlloc(cl_context context, cl_svm_mem_flags flags, size_t size,
cl_uint alignment);
void clSVMFree(cl_context context, void *svm_pointer);
cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map,
cl_map_flags flags, void *svm_ptr, size_t size,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
cl_int clEnqueueSVMMemFill(cl_command_queue command_queue, void *svm_ptr,
const void *pattern, size_t pattern_size,
size_t size, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index,
const void *arg_value);
cl_mem clCreatePipe(cl_context context, cl_mem_flags flags,
cl_uint packet_size, cl_uint num_packets,
const cl_pipe_properties *properties,
cl_int *errcode_ret);
cl_int clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
#endif
// AMD performance counter and clock mode entry points.
cl_perfcounter_amd clCreatePerfCounterAMD(cl_device_id device,
cl_perfcounter_property *properties,
cl_int *errcode_ret);
cl_int clEnqueueBeginPerfCounterAMD(cl_command_queue command_queue,
cl_uint num_perf_counters,
cl_perfcounter_amd *perf_counters,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
cl_int clEnqueueEndPerfCounterAMD(cl_command_queue command_queue,
cl_uint num_perf_counters,
cl_perfcounter_amd *perf_counters,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
cl_int clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter,
cl_perfcounter_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
cl_int clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter);
cl_int clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter);
cl_int clSetDeviceClockModeAMD(
cl_device_id device,
cl_set_device_clock_mode_input_amd set_clock_mode_input,
cl_set_device_clock_mode_output_amd *set_clock_mode_Output);
private:
// Function pointers for the extension entry points declared above.
// NOTE(review): none of these has an in-class initializer, so they hold
// indeterminate values unless the out-of-line constructor sets them -- confirm.
clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD_ptr;
clEnqueueWriteSignalAMD_fn clEnqueueWriteSignalAMD_ptr;
clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD_ptr;
// Unload the platform
clUnloadPlatformAMD_fn clUnloadPlatformAMD_ptr;
// CL-GL Extension: cl_khr_gl_sharing
clGetGLContextInfoKHR_fn clGetGLContextInfoKHR_ptr;
clCreateFromGLBuffer_fn clCreateFromGLBuffer_ptr;
clCreateFromGLTexture_fn clCreateFromGLTexture_ptr;
clCreateFromGLTexture2D_fn clCreateFromGLTexture2D_ptr;
clCreateFromGLRenderbuffer_fn clCreateFromGLRenderbuffer_ptr;
clGetGLObjectInfo_fn clGetGLObjectInfo_ptr;
clGetGLTextureInfo_fn clGetGLTextureInfo_ptr;
clEnqueueAcquireGLObjects_fn clEnqueueAcquireGLObjects_ptr;
clEnqueueReleaseGLObjects_fn clEnqueueReleaseGLObjects_ptr;
// Performance counters
clCreatePerfCounterAMD_fn clCreatePerfCounterAMD_ptr;
clEnqueueBeginPerfCounterAMD_fn clEnqueueBeginPerfCounterAMD_ptr;
clEnqueueEndPerfCounterAMD_fn clEnqueueEndPerfCounterAMD_ptr;
clGetPerfCounterInfoAMD_fn clGetPerfCounterInfoAMD_ptr;
clReleasePerfCounterAMD_fn clReleasePerfCounterAMD_ptr;
clRetainPerfCounterAMD_fn clRetainPerfCounterAMD_ptr;
// Set clockMode
clSetDeviceClockModeAMD_fn clSetDeviceClockModeAMD_ptr;
};
#endif
@@ -0,0 +1,104 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "oclTestLog.h"
#include <cassert>
#include <cstring>
#include "OCLLog.h"
// Builds a logger that prints to stdout, has no log file name yet and has
// file logging switched off.
oclLog::oclLog()
    : m_stdout_fp(stdout),
      m_filename(""),
      m_writeToFileIsEnabled(false) {
}
oclLog::~oclLog() { disable_write_to_file(); }
// Start mirroring log output to `filename`.
//
// The file is opened once in "w" mode, which creates it or truncates an
// existing one, and is closed again straight away; vprint() reopens it in
// append mode for every message.
//
// @param filename Path of the log file.
//
// File logging is switched on only after the file was opened successfully.
// If the open fails, an error is reported through oclTestLog() and file
// logging stays off. The flag must already be off while that error is
// reported: oclTestLog() ends up in vprint(), which would otherwise try to
// open the same unusable file, fail, and report again without end.
void oclLog::enable_write_to_file(std::string filename) {
  m_filename = filename;
  m_writeToFileIsEnabled = false;

  FILE* fp = fopen(m_filename.c_str(), "w");
  if (fp == NULL) {
    oclTestLog(OCLTEST_LOG_ALWAYS,
               "ERROR: Cannot open file %s. Disabling logging to file.\n",
               filename.c_str());
    return;
  }

  fclose(fp);
  m_writeToFileIsEnabled = true;
}
void oclLog::disable_write_to_file() { m_writeToFileIsEnabled = false; }
// Format a message and write it to the console stream and, when file logging
// is enabled, append it to the log file.
//
// @param fmt  printf-style format string.
// @param args Arguments for fmt; consumed once by vsnprintf.
//
// The message is formatted into a fixed 4096-byte stack buffer. vsnprintf
// always NUL-terminates within the buffer, so a longer message is truncated
// rather than overflowing. The buffer is zeroed first so that an empty string
// is printed if formatting fails.
//
// If the log file cannot be opened for appending, file logging is switched
// off *before* the error is reported through oclTestLog() (which calls back
// into this function), and nothing is written to or closed on the NULL handle.
void oclLog::vprint(char const* fmt, va_list args) {
  char buffer[4096];
  memset(buffer, 0, sizeof(buffer));
  int rc = vsnprintf(buffer, sizeof(buffer), fmt, args);
  // A negative result is an encoding error. A result >= sizeof(buffer) only
  // means the text was truncated, which is safe to print.
  assert(rc >= 0);
  (void)rc;  // rc is only inspected by the assert above

  fputs(buffer, m_stdout_fp);

  if (!m_writeToFileIsEnabled) {
    return;
  }

  FILE* fp = fopen(m_filename.c_str(), "a");
  if (fp == NULL) {
    // Disable first: the call below re-enters vprint().
    m_writeToFileIsEnabled = false;
    oclTestLog(OCLTEST_LOG_ALWAYS,
               "ERROR: Cannot open file %s. Disabling logging to file.\n",
               m_filename.c_str());
    return;
  }

  fputs(buffer, fp);
  fclose(fp);
}
void oclLog::flush() { fflush(m_stdout_fp); }
// Returns the logger shared by the oclTest* functions in this file. The
// instance is a function-local static, so it is constructed on the first call.
static oclLog& theLog() {
  static oclLog instance;
  return instance;
}
// Highest logging level that is currently printed; see oclTestSetLogLevel().
static oclLoggingLevel currentLevel = OCLTEST_LOG_ALWAYS;
// Incremented on every oclTestLog() call, printed or not.
static float logcount = 0.0f;

// Prints a printf-style message through the shared logger when `logLevel` is
// at or below the current logging level, then flushes the logger.
//
// @param logLevel Level of this message.
// @param fmt      printf-style format string, followed by its arguments.
void oclTestLog(oclLoggingLevel logLevel, const char* fmt, ...) {
  logcount += 1.0f;

  if (logLevel > currentLevel) {
    return;  // message is filtered out at the current level
  }

  va_list args;
  va_start(args, fmt);
  oclLog& log = theLog();
  log.vprint(fmt, args);
  log.flush();
  va_end(args);
}
void oclTestEnableLogToFile(const char* filename) {
theLog().enable_write_to_file(filename);
}
// Sets the level used by oclTestLog() to filter messages.
//
// @param level New level. Negative values are ignored and leave the current
//              level unchanged; any other value is cast to oclLoggingLevel
//              without an upper-bound check.
void oclTestSetLogLevel(int level) {
  if (level < 0) {
    return;
  }
  currentLevel = static_cast<oclLoggingLevel>(level);
}
@@ -0,0 +1,44 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef CALTESTLOG_H_
#define CALTESTLOG_H_
#include <stdarg.h>
#include <stdio.h>
#include <string>
// Logger that prints formatted messages to a stdio stream and can also append
// them to a log file. All operations are virtual.
class oclLog {
public:
// Creates a logger with file logging switched off.
oclLog();
virtual ~oclLog();
// Formats fmt/args printf-style and writes the result to the console stream
// and, when file logging is enabled, to the log file.
virtual void vprint(char const* fmt, va_list args);
// Flushes the console stream.
virtual void flush();
// Remembers filename and turns on file logging.
virtual void enable_write_to_file(std::string filename);
// Turns file logging off.
virtual void disable_write_to_file();
private:
// Console stream that every message is written to.
FILE* m_stdout_fp;
// Path of the log file used while file logging is enabled.
std::string m_filename;
// True while messages are also appended to m_filename.
bool m_writeToFileIsEnabled;
};
#endif // CALTESTLOG_H_
@@ -0,0 +1,185 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "BaseTestImp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <cstring>
/////////////////////////////////////////////////////////////////////////////
// Forward declarations of file-local helpers.
// crcinit is defined at the bottom of this file and is called by the
// BaseTestImp constructor to fill _crctab.
static unsigned int crcinit(unsigned int crc);
// NOTE(review): no definition or call of initializeSeed is visible in this
// part of the file -- confirm it is still needed.
static int initializeSeed(void);
/////////////////////////////////////////////////////////////////////////////
// Puts a test object into its initial state: counters and indices zeroed, no
// device name, the CRC lookup table filled in and the error state cleared.
BaseTestImp::BaseTestImp()
    : _numSubTests(0), _openTest(0), _deviceName(NULL), _architecture(0) {
  _cpu = false;

  // One CRC table entry per possible byte value; the byte is placed in the
  // top 8 bits of the word handed to crcinit().
  for (unsigned int byteValue = 0; byteValue < 256; ++byteValue) {
    _crctab[byteValue] = crcinit(byteValue << 24);
  }
  _crcword = ~0;

  _deviceId = 0;
  _platformIndex = 0;
  _perfInfo = 0.0f;

#ifdef ATI_OS_LINUX
  _useThreads = 0;  // disable threads on linux
#else
  _useThreads = 1;  // if available on platform
#endif

  clearError();
}
// Selects the OpenCL platform and GPU device configured on this object and
// marks the test as failed when the device's driver version string contains
// "LC".
//
// Steps: reset the test state via open(), query the platform list, pick the
// platform at _platformIndex, query that platform's GPU devices into devices_
// and read CL_DRIVER_VERSION of the device at _deviceId.
//
// @param test         Unused here.
// @param deviceName   Unused here.
// @param architecture Unused here.
//
// Errors are reported with CHECK_RESULT, as everywhere else in this file.
// NOTE(review): CHECK_RESULT is assumed to record the message and leave the
// function when its condition is true -- confirm against its definition.
// The temporary platform array is released before any check that can leave
// the function, and both _platformIndex and _deviceId are range-checked
// before they are used as array indices.
void BaseTestImp::checkComplib(unsigned int test, const char *deviceName,
                               unsigned int architecture) {
  BaseTestImp::open();

  devices_ = 0;
  deviceCount_ = 0;
  context_ = 0;
  program_ = 0;
  kernel_ = 0;
  type_ = CL_DEVICE_TYPE_GPU;

  cl_uint numPlatforms = 0;
  error_ = clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
  CHECK_RESULT((numPlatforms == 0), "No platform found");

  cl_platform_id *platforms = new cl_platform_id[numPlatforms];
  error_ = clGetPlatformIDs(numPlatforms, platforms, NULL);

  // The platform is chosen by index, not by vendor name. Copy it out and free
  // the array first so none of the checks below can leak it; an index past
  // the end leaves `platform` at 0 and is reported below.
  cl_platform_id platform = 0;
  if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
    platform = platforms[_platformIndex];
  }
  delete[] platforms;

  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
  CHECK_RESULT((platform == 0), "AMD Platform not found");

  error_ = clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
  // Checked before devices_ is allocated so that a bad index neither leaks
  // the array nor reads past its end.
  CHECK_RESULT((_deviceId >= deviceCount_),
               "Requested device index is out of range");

  devices_ = new cl_device_id[deviceCount_];
  error_ = clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");

  // Zero-initialised so strstr() sees an empty string if the query fails.
  char device_string[200] = {0};
  clGetDeviceInfo(devices_[_deviceId], CL_DRIVER_VERSION, sizeof(device_string),
                  device_string, NULL);
  if (strstr(device_string, "LC")) {
    printf("Skipping test since it does not run with LC\n");
    failed_ = true;
  }
}
BaseTestImp::~BaseTestImp() {}
void BaseTestImp::open() {
_crcword = 0;
clearError();
}
void BaseTestImp::open(unsigned int test, const char *deviceName,
unsigned int architecture) {
open();
}
// Ends a run and returns the current value of _crcword.
unsigned int BaseTestImp::close() { return _crcword; }
// Returns the stored _useThreads value.
unsigned int BaseTestImp::getThreadUsage(void) { return _useThreads; }
// Returns the stored _numSubTests value.
int BaseTestImp::getNumSubTests(void) { return _numSubTests; }
// Stores the given pointer as the device name; no copy of the characters is
// made here, so the caller presumably has to keep the string alive.
void BaseTestImp::setDeviceName(const char *name) { _deviceName = name; }
// Returns the pointer last stored by setDeviceName().
const char *BaseTestImp::getDeviceName() { return _deviceName; }
// Returns the stored performance figure (_perfInfo).
float BaseTestImp::getPerfInfo(void) { return _perfInfo; }
// Resets the stored performance figure to zero.
void BaseTestImp::clearPerfInfo(void) { _perfInfo = 0.0; }
// Selects the device index used by this test (_deviceId).
void BaseTestImp::setDeviceId(unsigned int deviceId) { _deviceId = deviceId; }
// Stores the iteration count (_iterationCnt).
void BaseTestImp::setIterationCount(int cnt) { _iterationCnt = cnt; }
// Returns the device index used by this test.
unsigned int BaseTestImp::getDeviceId() { return _deviceId; }
// Selects the platform index used by this test (_platformIndex).
void BaseTestImp::setPlatformIndex(unsigned int platformIndex) {
_platformIndex = platformIndex;
}
// Returns the platform index used by this test.
unsigned int BaseTestImp::getPlatformIndex() { return _platformIndex; }
void BaseTestImp::setErrorMsg(const char *error) {
_errorFlag = true;
_errorMsg.assign((const char *)error);
}
// Returns the recorded error message as a C string. The pointer comes from
// _errorMsg.c_str(), so it refers to storage owned by _errorMsg.
const char *BaseTestImp::getErrorMsg() { return _errorMsg.c_str(); }
// Returns true once setErrorMsg() has been called since the last clearError().
bool BaseTestImp::hasErrorOccured() { return _errorFlag; }
// Drops any recorded error: lowers the flag and empties the message.
void BaseTestImp::clearError() {
_errorFlag = false;
_errorMsg.clear();
}
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
//
// Same CRC32 as used by ogtst
//
// Value XORed into the register whenever a set top bit is shifted out.
static const unsigned int CRCMASK = 0x04c11db7;

// Advances the 32-bit register `crc` by eight shift steps, most significant
// bit first, and returns the resulting register value.
static unsigned int crcinit(unsigned int crc) {
  unsigned int reg = crc;
  for (int step = 8; step > 0; --step) {
    const bool topBitSet = (reg & 0x80000000) != 0;
    reg <<= 1;
    if (topBitSet) {
      reg ^= CRCMASK;
    }
  }
  return reg;
}
@@ -0,0 +1,175 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLCommon.h"
#include <cmath>
#include <cstring>
void OCLGLCommon::open(unsigned int test, char *units, double &conversion,
unsigned int deviceId) {
// OpenCL Initialization
OCLTestImp::open(test, units, conversion, deviceId);
CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_);
char name[1024] = {0};
size_t size = 0;
if (deviceId >= deviceCount_) {
_errorFlag = true;
return;
}
// Check that the device supports CL/GL interop extension
_wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 1024,
name, &size);
if (!strstr(name, "cl_khr_gl_sharing")) {
printf("KHR GL sharing extension is required for this test!\n");
_errorFlag = true;
return;
}
// OpenGL Initialization
bool retVal = initializeGLContext(hGL_);
CHECK_RESULT((retVal == CL_SUCCESS), "Error opening test (%d)", error_);
createCLContextFromGLContext(hGL_);
}
/// Probes whether an OpenGL context usable by this test can be created.
/// Opens the base OpenCL test, tries initializeGLContext() on hGL_, releases
/// the GL context again when that succeeded, closes the base test and
/// returns the outcome of the initialization attempt.
bool OCLGLCommon::IsGLEnabled(unsigned int test, char *units,
                              double &conversion, unsigned int deviceId) {
  OCLTestImp::open(test, units, conversion, deviceId);

  const bool glAvailable = initializeGLContext(hGL_);
  if (glAvailable) deleteGLContext(hGL_);

  OCLTestImp::close();
  return glAvailable;
}
/// Sets up a perspective projection by computing the near-plane extents and
/// passing them to glFrustum().
///
/// @param fovy    Vertical field of view, in degrees.
/// @param aspect  Width divided by height of the view volume.
/// @param zNear   Distance to the near clipping plane.
/// @param zFar    Distance to the far clipping plane.
void OCLGLCommon::gluPerspective(double fovy, double aspect, double zNear,
                                 double zFar) {
  // Half of the field of view in radians is fovy * pi / 360.
  const double kPi = 3.14159265358979323846;
  const double ymax = zNear * tan(fovy * kPi / 360.0);
  const double ymin = -ymax;
  const double xmin = ymin * aspect;
  const double xmax = ymax * aspect;
  glFrustum(xmin, xmax, ymin, ymax, zNear, zFar);
}
/// Closes the test: binds the test's GL context, lets OCLTestImp::close()
/// release the OpenCL side, then deletes the GL context.
///
/// @return The value returned by OCLTestImp::close().
unsigned int OCLGLCommon::close(void) {
  makeCurrent(hGL_);
  const unsigned int baseResult = OCLTestImp::close();
  deleteGLContext(hGL_);
  return baseResult;
}
/// Writes a dimSize x dimSize block of floats to a text file, one row per
/// line, each value printed with "%e" and followed by a comma and a tab.
/// Does nothing when pBuffer is NULL or the file cannot be opened.
///
/// @param pBuffer   Values to dump, read as pBuffer[row * dimSize + column].
/// @param fileName  Path of the file to (over)write.
/// @param dimSize   Number of rows and of columns.
void OCLGLCommon::dumpBuffer(float *pBuffer, const char fileName[],
                             unsigned int dimSize) {
  if (!pBuffer) {
    return;
  }
  FILE *out = fopen(fileName, "w");
  if (out == NULL) {
    return;
  }
  for (unsigned int row = 0; row < dimSize; ++row) {
    for (unsigned int col = 0; col < dimSize; ++col) {
      fprintf(out, "%e,\t", pBuffer[row * (dimSize) + col]);
    }
    fprintf(out, "\n");
  }
  fclose(out);
}
// Builds a GL program from a single fragment shader source string.
//
// source  - shader source text handed to glShaderSource().
// shader  - receives the id returned by glCreateShader().
// program - receives the id returned by glCreateProgram().
//
// Returns true when the program id is non-zero. Compile and link status are
// not queried here; the info logs are only printed.
bool OCLGLCommon::createGLFragmentProgramFromSource(const char *source,
GLuint &shader,
GLuint &program) {
// Create and compile the fragment shader, then print its info log.
shader = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(shader, 1, &source, NULL);
glCompileShader(shader);
printShaderInfoLog(shader);
// Create the program, attach the shader, link, then print the info log.
program = glCreateProgram();
glAttachShader(program, shader);
glLinkProgram(program);
printProgramInfoLog(program);
return program != 0;
}
int OCLGLCommon::printOglError(char *file, int line) {
//
// Returns 1 if an OpenGL error occurred, 0 otherwise.
//
GLenum glErr;
int retCode = 0;
glErr = glGetError();
if (glErr != GL_NO_ERROR) {
printf("glError in file %s @ line %d: %d\n", file, line, glErr);
retCode = 1;
}
return retCode;
}
//
// Print out the information log for a shader object
//
/// Prints the info log of a shader object to stdout. Nothing is printed when
/// the reported log length is zero or negative; an allocation failure is
/// reported and the function returns.
void OCLGLCommon::printShaderInfoLog(GLuint shader) {
  int logBytes = 0;
  glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &logBytes);
  if (logBytes <= 0) {
    return;
  }

  GLchar *text = (GLchar *)malloc(logBytes);
  if (text == NULL) {
    printf("ERROR: Could not allocate InfoLog buffer\n");
    return;
  }

  int written = 0;
  glGetShaderInfoLog(shader, logBytes, &written, text);
  printf("Shader InfoLog:\n%s\n\n", text);
  free(text);
}
/// Prints the info log of a program object to stdout. Nothing is printed when
/// the reported log length is zero or negative.
///
/// An allocation failure is reported and the function returns, matching
/// printShaderInfoLog(); a diagnostic helper should not terminate the whole
/// test process.
void OCLGLCommon::printProgramInfoLog(GLuint program) {
  int infologLength = 0;
  int charsWritten = 0;
  GLchar *infoLog;
  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infologLength);
  if (infologLength > 0) {
    infoLog = (GLchar *)malloc(infologLength);
    if (infoLog == NULL) {
      printf("ERROR: Could not allocate InfoLog buffer\n");
      return;
    }
    glGetProgramInfoLog(program, infologLength, &charsWritten, infoLog);
    printf("Program InfoLog:\n%s\n\n", infoLog);
    free(infoLog);
  }
}
@@ -0,0 +1,80 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_COMMON_H_
#define _OCL_GL_COMMON_H_
#include <GL/glew.h>
#include <GL/gl.h>
#include <GL/glx.h>
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include "OCLTestImp.h"
// Opaque handle to the window-system specific GL state. struct OCLGLHandle_
// is only declared here; each platform source file provides its definition.
typedef struct OCLGLHandle_* OCLGLHandle;
// Convenience wrapper that reports a pending GL error together with the
// source location of the call site.
#define printOpenGLError() OCLGLCommon::printOglError(__FILE__, __LINE__)
// Base class for CL/GL interop tests: extends OCLTestImp with creation and
// tear-down of an OpenGL context and a few GL helper routines.
class OCLGLCommon : public OCLTestImp {
public:
/////////////////////////////////
// construction and clean-up   //
/////////////////////////////////
OCLGLCommon();
virtual ~OCLGLCommon();
///////////////////////
// virtual interface //
///////////////////////
// Opens the test; parameters are forwarded to OCLTestImp::open().
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
// Closes the test and returns the result of OCLTestImp::close().
virtual unsigned int close(void);
// Computes frustum extents from a field of view and calls glFrustum().
static void gluPerspective(double fovy, double aspect, double zNear,
double zFar);
// Writes a dimSize x dimSize float matrix to a text file.
static void dumpBuffer(float* pBuffer, const char fileName[],
unsigned int dimSize);
// Returns 1 and prints a message if glGetError() reports an error, else 0.
static int printOglError(char* file, int line);
// Compiles a fragment shader and links it into a new program object.
static bool createGLFragmentProgramFromSource(const char* source,
GLuint& shader,
GLuint& program);
// Print the info log of a shader / program object to stdout.
static void printShaderInfoLog(GLuint shader);
static void printProgramInfoLog(GLuint program);
protected:
// Returns the GL handle owned by this test.
const OCLGLHandle getGLHandle() { return hGL_; }
// Binds the GL context of hGL; a NULL handle releases the current context.
void makeCurrent(const OCLGLHandle hGL);
// Fills the 7-entry, zero-terminated property list describing the platform
// and the GL context of hGL.
void getCLContextPropertiesFromGLContext(const OCLGLHandle hGL,
cl_context_properties properties[7]);
// Allocates a new handle into hGL and initializes a GL context for it.
bool createGLContext(OCLGLHandle& hGL);
// Deletes the GL context of hGL, frees the handle and sets hGL to NULL.
void destroyGLContext(OCLGLHandle& hGL);
// Opens the base test, tries to create a GL context, and closes again.
bool IsGLEnabled(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
private:
// Platform specific helpers; implemented in the per-platform source files.
bool initializeGLContext(OCLGLHandle& hGL);
void deleteGLContext(OCLGLHandle& hGL);
bool checkAssociationDeviceWithGLContext(OCLGLHandle& hGL);
void createCLContextFromGLContext(OCLGLHandle& hGL);
// GL handle allocated in the constructor and released in the destructor.
OCLGLHandle hGL_;
};
#endif // _OCL_GL_COMMON_H_
@@ -0,0 +1,239 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLCommon.h"
// GLX flavour of the GL handle. The X display connection, the chosen visual
// and the reference count are static, i.e. shared by every handle; the GLX
// context, window and colormap belong to the individual handle.
struct OCLGLHandle_ {
static Display* display;
static XVisualInfo* vInfo;
// Incremented in initializeGLContext() and decremented in deleteGLContext();
// the display and visual are released when it reaches zero.
static int referenceCount;
GLXContext context;
Window window;
Colormap cmap;
};
// Definitions of the shared state: no display, no visual, no users yet.
Display* OCLGLHandle_::display = NULL;
XVisualInfo* OCLGLHandle_::vInfo = NULL;
int OCLGLHandle_::referenceCount = 0;
/// Allocates the GL handle owned by this test and marks it as holding no GLX
/// context, window or colormap yet.
OCLGLCommon::OCLGLCommon() {
  OCLGLHandle_* fresh = new OCLGLHandle_;
  fresh->cmap = 0;
  fresh->window = 0;
  fresh->context = NULL;
  hGL_ = fresh;
}
// Releases the GL context (if any) and the handle allocated by the constructor.
OCLGLCommon::~OCLGLCommon() { destroyGLContext(hGL_); }
/// Deletes the GL context held by hGL, frees the handle itself and resets
/// hGL to NULL. Calling it again with the now-NULL handle is a no-op.
void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) {
  if (hGL == NULL) {
    return;  // already destroyed; deleteGLContext() would dereference it
  }
  deleteGLContext(hGL);
  delete hGL;
  hGL = NULL;
}
/// Releases the GLX resources held by hGL (colormap, window, context).
///
/// The X display and the visual are shared by all handles and counted in
/// OCLGLHandle_::referenceCount. A handle holds one reference exactly while
/// it has a GLX context, so the count is only dropped when a context is
/// actually destroyed here. This makes repeated calls on the same handle
/// (for example close() followed by the destructor) harmless instead of
/// closing the display underneath other live handles.
void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) {
  if (hGL == NULL || hGL->display == NULL) {
    return;
  }
  if (hGL->context == NULL && hGL->window == 0 && hGL->cmap == 0) {
    return;  // this handle owns nothing, leave the shared state alone
  }
  glXMakeCurrent(hGL->display, None, NULL);
  if (hGL->cmap) {
    XFreeColormap(hGL->display, hGL->cmap);
    hGL->cmap = 0;
  }
  if (hGL->window) {
    XDestroyWindow(hGL->display, hGL->window);
    hGL->window = 0;
  }
  if (hGL->context) {
    glXDestroyContext(hGL->display, hGL->context);
    hGL->context = NULL;
    // Give back the reference taken when the context was created.
    if (hGL->referenceCount > 0 && --hGL->referenceCount == 0) {
      XCloseDisplay(hGL->display);
      hGL->display = NULL;
      XFree(hGL->vInfo);
      hGL->vInfo = NULL;
    }
  }
}

/// Allocates a new handle into hGL and initializes a GL context for it.
/// Any handle previously stored in hGL is overwritten, not freed.
///
/// @return The result of initializeGLContext().
bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) {
  // Value-initialization zeroes context/window/cmap so that a failed
  // initialization never leaves indeterminate values for the clean-up code.
  hGL = new OCLGLHandle_();
  return initializeGLContext(hGL);
}

/// Opens the shared X display and visual on first use, then creates a GLX
/// context, a colormap and a 640x480 window for hGL, makes the context
/// current and verifies that the selected CL device can share with it.
///
/// @return true on success; false when any step fails. When the device
///         association check fails the GL resources are released again.
bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) {
  if (hGL->display == NULL) {
    hGL->display = XOpenDisplay(NULL);
    if (hGL->display == NULL) {
      printf("XOpenDisplay() failed\n");
      return false;
    }
  }
  if (hGL->vInfo == NULL) {
    int dblBuf[] = {GLX_RGBA,     GLX_RED_SIZE,  1, GLX_GREEN_SIZE,
                    1,            GLX_BLUE_SIZE, 1, GLX_DEPTH_SIZE,
                    12,           GLX_DOUBLEBUFFER, None};
    hGL->vInfo =
        glXChooseVisual(hGL->display, DefaultScreen(hGL->display), dblBuf);
    if (hGL->vInfo == NULL) {
      printf("glXChooseVisual() failed\n");
      return false;
    }
  }
  hGL->context = glXCreateContext(hGL->display, hGL->vInfo, None, True);
  if (hGL->context == NULL) {
    printf("glXCreateContext() failed\n");
    return false;
  }
  // The handle now owns a context and therefore one reference to the shared
  // display/visual; deleteGLContext() returns it when the context goes away.
  hGL->referenceCount++;

  XSetWindowAttributes swa = {0};
  hGL->cmap = XCreateColormap(hGL->display,
                              RootWindow(hGL->display, hGL->vInfo->screen),
                              hGL->vInfo->visual, AllocNone);
  swa.colormap = hGL->cmap;
  hGL->window = XCreateWindow(
      hGL->display, RootWindow(hGL->display, hGL->vInfo->screen), 0, 0, 640,
      480, 0, hGL->vInfo->depth, InputOutput, hGL->vInfo->visual,
      CWBorderPixel | CWColormap | CWEventMask, &swa);
  Bool glErr = glXMakeCurrent(hGL->display, hGL->window, hGL->context);
  if (False == glErr) {
    return false;
  }
  if (!checkAssociationDeviceWithGLContext(hGL)) {
    deleteGLContext(hGL);
    return false;
  }
  return true;
}
/// Asks the CL runtime which devices can share with the GLX context of hGL
/// and reports whether devices_[_deviceId] is one of them.
///
/// @return true when the selected device is in the returned list; false when
///         it is not or when either clGetGLContextInfoKHR query fails.
bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) {
  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
                                        (cl_context_properties)platform_,
                                        CL_GL_CONTEXT_KHR,
                                        (cl_context_properties)hGL->context,
                                        CL_GLX_DISPLAY_KHR,
                                        (cl_context_properties)hGL->display,
                                        0};

  // First query: size in bytes of the device list.
  size_t listBytes = 0;
  error_ = _wrapper->clGetGLContextInfoKHR(
      properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &listBytes);
  if (error_ != CL_SUCCESS) {
    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
    return false;
  }

  // Second query: the device list itself.
  cl_uint candidateCount = (cl_uint)listBytes / sizeof(cl_device_id);
  cl_device_id* candidates = (cl_device_id*)malloc(listBytes);
  error_ =
      _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR,
                                      listBytes, candidates, NULL);

  bool associated = false;
  if (error_ != CL_SUCCESS) {
    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
  } else {
    // Check that current device can be associated with OpenGL context
    for (unsigned int idx = 0; idx < candidateCount && !associated; idx++) {
      associated = (candidates[idx] == devices_[_deviceId]);
    }
  }
  free(candidates);
  return associated;
}
/// Replaces the CL context and the command queue of the selected device with
/// ones created against the GLX context of hGL, then initializes GLEW.
///
/// Released handles are reset to 0 right away, so a failure further down
/// does not leave stale handles behind for a later release.
void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) {
  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
                                        (cl_context_properties)platform_,
                                        CL_GL_CONTEXT_KHR,
                                        (cl_context_properties)hGL->context,
                                        CL_GLX_DISPLAY_KHR,
                                        (cl_context_properties)hGL->display,
                                        0};
  // Release current command queue
  if (cmdQueues_[_deviceId]) {
    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseCommandQueue() failed");
    cmdQueues_[_deviceId] = 0;
  }
  // Release current context
  if (context_) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
    context_ = 0;
  }
  // Create new CL context from GL context. Goes through the wrapper like
  // every other CL call in this class.
  context_ = _wrapper->clCreateContext(properties, 1, &devices_[_deviceId],
                                       NULL, NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_);
  // Create command queue for new context
  cmdQueues_[_deviceId] =
      _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
               error_);
  // GLEW versions 1.13.0 and earlier do not fetch all GL function pointers
  // without glewExperimental set.
  glewExperimental = GL_TRUE;
  GLenum glErr = glewInit();
  CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed: %s",
               glewGetErrorString(glErr));
}
/// Binds the GLX context of hGL to the calling thread. A NULL handle
/// releases the current context instead.
///
/// Handles whose context has already been deleted, or for which no display
/// is open, are skipped: glXMakeCurrent() must not be called with a NULL
/// display, which is what happens on the close() path after a failed open().
void OCLGLCommon::makeCurrent(OCLGLHandle hGL) {
  if (hGL == NULL) {
    if (hGL_ != NULL && hGL_->display != NULL) {
      glXMakeCurrent(hGL_->display, None, NULL);
    }
    return;
  }
  if (hGL->display == NULL || hGL->context == NULL) {
    return;  // nothing to bind
  }
  const Bool ret = glXMakeCurrent(hGL->display, hGL->window, hGL->context);
  assert(ret && "glXMakeCurrent failed!");
  (void)ret;  // keeps release builds free of an unused-variable warning
}
void OCLGLCommon::getCLContextPropertiesFromGLContext(
const OCLGLHandle hGL, cl_context_properties properties[7]) {
if (!properties) return;
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = (cl_context_properties)platform_;
properties[2] = CL_GL_CONTEXT_KHR;
properties[3] = (cl_context_properties)hGL->context;
properties[4] = CL_GLX_DISPLAY_KHR;
properties[5] = (cl_context_properties)hGL->display;
properties[6] = 0;
}
@@ -0,0 +1,239 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLCommon.h"
// WGL flavour of the GL handle: the device context obtained with CreateDC()
// and the GL rendering context created on it with wglCreateContext().
struct OCLGLHandle_ {
HDC hdc;
HGLRC hglrc;
};
/// Allocates the GL handle owned by this test and marks it as holding neither
/// a device context nor a rendering context yet.
OCLGLCommon::OCLGLCommon() {
  OCLGLHandle_* fresh = new OCLGLHandle_;
  fresh->hglrc = NULL;
  fresh->hdc = NULL;
  hGL_ = fresh;
}
// Releases the GL context (if any) and the handle allocated by the constructor.
OCLGLCommon::~OCLGLCommon() { destroyGLContext(hGL_); }
/// Deletes the GL context held by hGL, frees the handle itself and resets
/// hGL to NULL. Calling it again with the now-NULL handle is a no-op.
void OCLGLCommon::destroyGLContext(OCLGLHandle& hGL) {
  if (hGL == NULL) {
    return;  // already destroyed; deleteGLContext() would dereference it
  }
  deleteGLContext(hGL);
  delete hGL;
  hGL = NULL;
}
/// Unbinds the current WGL context and releases the rendering context and
/// the device context held by hGL, resetting both fields to NULL. A NULL
/// handle is ignored.
void OCLGLCommon::deleteGLContext(OCLGLHandle& hGL) {
  if (hGL == NULL) {
    return;
  }
  wglMakeCurrent(NULL, NULL);
  if (hGL->hglrc) {
    wglDeleteContext(hGL->hglrc);
    hGL->hglrc = NULL;
  }
  if (hGL->hdc) {
    DeleteDC(hGL->hdc);
    hGL->hdc = NULL;
  }
}
/// Allocates a new handle into hGL and initializes a GL context for it.
/// Any handle previously stored in hGL is overwritten, not freed.
///
/// @return The result of initializeGLContext().
bool OCLGLCommon::createGLContext(OCLGLHandle& hGL) {
  // Value-initialization zeroes hdc/hglrc, so a failed initialization (for
  // example when no display device is enumerated) never leaves indeterminate
  // values for deleteGLContext() to release.
  hGL = new OCLGLHandle_();
  return initializeGLContext(hGL);
}
// Creates a WGL rendering context for hGL on a display device and makes it
// current, then verifies that the selected CL device can share with it.
//
// Display devices are enumerated in order; mirroring drivers and devices for
// which CreateDC() fails are skipped. The first device that yields a device
// context decides the outcome: every later step either returns true or
// returns false, no further device is tried.
//
// Returns true on success, false when a step fails or no device was usable.
bool OCLGLCommon::initializeGLContext(OCLGLHandle& hGL) {
BOOL glErr = FALSE;
DISPLAY_DEVICE dispDevice;
DWORD deviceNum;
int pfmt;
// Requested pixel format: double-buffered RGBA window surface with 24 colour
// bits, 8 alpha bits, a 24-bit depth buffer and an 8-bit stencil buffer.
PIXELFORMATDESCRIPTOR pfd;
pfd.nSize = sizeof(PIXELFORMATDESCRIPTOR);
pfd.nVersion = 1;
pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
pfd.iPixelType = PFD_TYPE_RGBA;
pfd.cColorBits = 24;
pfd.cRedBits = 8;
pfd.cRedShift = 0;
pfd.cGreenBits = 8;
pfd.cGreenShift = 0;
pfd.cBlueBits = 8;
pfd.cBlueShift = 0;
pfd.cAlphaBits = 8;
pfd.cAlphaShift = 0;
pfd.cAccumBits = 0;
pfd.cAccumRedBits = 0;
pfd.cAccumGreenBits = 0;
pfd.cAccumBlueBits = 0;
pfd.cAccumAlphaBits = 0;
pfd.cDepthBits = 24;
pfd.cStencilBits = 8;
pfd.cAuxBuffers = 0;
pfd.iLayerType = PFD_MAIN_PLANE;
pfd.bReserved = 0;
pfd.dwLayerMask = 0;
pfd.dwVisibleMask = 0;
pfd.dwDamageMask = 0;
// EnumDisplayDevices() requires cb to hold the structure size.
dispDevice.cb = sizeof(DISPLAY_DEVICE);
for (deviceNum = 0; EnumDisplayDevices(NULL, deviceNum, &dispDevice, 0);
deviceNum++) {
if (dispDevice.StateFlags & DISPLAY_DEVICE_MIRRORING_DRIVER) {
continue;
}
hGL->hdc = CreateDC(NULL, dispDevice.DeviceName, NULL, NULL);
if (!hGL->hdc) {
continue;
}
pfmt = ChoosePixelFormat(hGL->hdc, &pfd);
if (pfmt == 0) {
printf("Failed choosing the requested PixelFormat.\n");
return false;
}
glErr = SetPixelFormat(hGL->hdc, pfmt, &pfd);
if (glErr == FALSE) {
printf("Failed to set the requested PixelFormat.\n");
return false;
}
hGL->hglrc = wglCreateContext(hGL->hdc);
if (NULL == hGL->hglrc) {
printf("wglCreateContext() failed\n");
return false;
}
glErr = wglMakeCurrent(hGL->hdc, hGL->hglrc);
if (FALSE == glErr) {
printf("wglMakeCurrent() failed\n");
return false;
}
// Only this failure path releases the context and device context again;
// the earlier ones leave them in hGL for a later deleteGLContext().
if (!checkAssociationDeviceWithGLContext(hGL)) {
deleteGLContext(hGL);
return false;
}
return true;
} // for (deviceNum = 0; EnumDisplayDevices(NULL, deviceNum, &dispDevice,
// 0); deviceNum++) {
// No display device produced a device context.
return false;
}
/// Asks the CL runtime which devices can share with the WGL context of hGL
/// and reports whether devices_[_deviceId] is one of them.
///
/// @return true when the selected device is in the returned list; false when
///         it is not or when either clGetGLContextInfoKHR query fails.
bool OCLGLCommon::checkAssociationDeviceWithGLContext(OCLGLHandle& hGL) {
  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
                                        (cl_context_properties)platform_,
                                        CL_GL_CONTEXT_KHR,
                                        (cl_context_properties)hGL->hglrc,
                                        CL_WGL_HDC_KHR,
                                        (cl_context_properties)hGL->hdc,
                                        0};

  // First query: size in bytes of the device list.
  size_t listBytes = 0;
  error_ = _wrapper->clGetGLContextInfoKHR(
      properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, 0, NULL, &listBytes);
  if (error_ != CL_SUCCESS) {
    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
    return false;
  }

  // Second query: the device list itself.
  cl_uint candidateCount = (cl_uint)listBytes / sizeof(cl_device_id);
  cl_device_id* candidates = (cl_device_id*)malloc(listBytes);
  error_ =
      _wrapper->clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR,
                                      listBytes, candidates, NULL);

  bool associated = false;
  if (error_ != CL_SUCCESS) {
    printf("clGetGLContextInfoKHR failed (%d)\n", error_);
  } else {
    // Check that current device can be associated with OpenGL context
    for (unsigned int idx = 0; idx < candidateCount && !associated; idx++) {
      associated = (candidates[idx] == devices_[_deviceId]);
    }
  }
  free(candidates);
  return associated;
}
/// Replaces the CL context and the command queue of the selected device with
/// ones created against the WGL context of hGL, then initializes GLEW.
///
/// Released handles are reset to 0 right away, so a failure further down
/// does not leave stale handles behind for a later release.
void OCLGLCommon::createCLContextFromGLContext(OCLGLHandle& hGL) {
  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
                                        (cl_context_properties)platform_,
                                        CL_GL_CONTEXT_KHR,
                                        (cl_context_properties)hGL->hglrc,
                                        CL_WGL_HDC_KHR,
                                        (cl_context_properties)hGL->hdc,
                                        0};
  // Release current command queue
  if (cmdQueues_[_deviceId]) {
    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[_deviceId]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseCommandQueue() failed");
    cmdQueues_[_deviceId] = 0;
  }
  // Release current context
  if (context_) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
    context_ = 0;
  }
  // Create new CL context from GL context. Goes through the wrapper like
  // every other CL call in this class.
  context_ = _wrapper->clCreateContext(properties, 1, &devices_[_deviceId],
                                       NULL, NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)", error_);
  // Create command queue for new context
  cmdQueues_[_deviceId] =
      _wrapper->clCreateCommandQueue(context_, devices_[_deviceId], 0, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
               error_);
  GLenum glErr = glewInit();
  CHECK_RESULT((glErr != GLEW_OK), "glewInit() failed");
}
/// Binds the WGL context of hGL to the calling thread; a NULL handle
/// releases the current context instead.
void OCLGLCommon::makeCurrent(OCLGLHandle hGL) {
  if (hGL != NULL) {
    wglMakeCurrent(hGL->hdc, hGL->hglrc);
    return;
  }
  wglMakeCurrent(NULL, NULL);
}
void OCLGLCommon::getCLContextPropertiesFromGLContext(
const OCLGLHandle hGL, cl_context_properties properties[7]) {
if (!properties) return;
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = (cl_context_properties)platform_;
properties[2] = CL_GL_CONTEXT_KHR;
properties[3] = (cl_context_properties)hGL->hglrc;
properties[4] = CL_WGL_HDC_KHR;
properties[5] = (cl_context_properties)hGL->hdc;
properties[6] = 0;
}
@@ -0,0 +1,288 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLTestImp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <cstdio>
#include <cstring>
/////////////////////////////////////////////////////////////////////////////
static unsigned int crcinit(unsigned int crc);
static int initializeSeed(void);
/////////////////////////////////////////////////////////////////////////////
OCLutil::Lock OCLTestImp::openDeviceLock;
OCLutil::Lock OCLTestImp::compileLock;
/// Builds the 256-entry CRC table, seeds the random generator from the
/// clock and puts every OpenCL handle into its "nothing open" state. The
/// device type defaults to GPU.
OCLTestImp::OCLTestImp()
    : _wrapper(0),
      _seed(initializeSeed()),
      error_(0),
      type_(CL_DEVICE_TYPE_GPU),
      deviceCount_(0),
      devices_(0),
      platform_(0),
      context_(0),
      program_(0),
      kernel_(0) {
  // One table entry per possible top byte.
  for (unsigned int byte = 0; byte < 256; byte++) {
    _crctab[byte] = crcinit(byte << 24);
  }
  _errorFlag = false;
  _errorMsg = "";
  _iterationCnt = 0;
  _perfInfo = 0;
}
// Destructor: performs no work of its own; OpenCL objects are released in
// close().
OCLTestImp::~OCLTestImp() {}
// Switches the device type requested from clGetDeviceIDs() to CPU devices.
void OCLTestImp::useCPU() { type_ = CL_DEVICE_TYPE_CPU; }
void OCLTestImp::open(unsigned int test, char* units, double& conversion,
unsigned int deviceId) {
devices_ = 0;
context_ = 0;
program_ = 0;
kernel_ = 0;
deviceCount_ = 0;
open(test, units, conversion, deviceId, getPlatformIndex());
}
/// Opens the test on an explicit platform: selects the platform, enumerates
/// its devices of type type_, creates one context spanning all of them and
/// one profiling-enabled command queue per device (appended to cmdQueues_).
///
/// @param test           Not used by this function.
/// @param units          Not used by this function.
/// @param conversion     Not used by this function.
/// @param deviceId       Stored in _deviceId.
/// @param platformIndex  Stored in _platformIndex; an index beyond the number
///                       of platforms is reported as "AMD Platform not found".
void OCLTestImp::open(unsigned int test, char* units, double& conversion,
                      unsigned int deviceId, unsigned int platformIndex) {
  BaseTestImp::open();
  devices_ = 0;
  deviceCount_ = 0;
  context_ = 0;
  program_ = 0;
  kernel_ = 0;
  _deviceId = deviceId;
  _platformIndex = platformIndex;

  cl_uint numPlatforms = 0;
  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
  CHECK_RESULT((numPlatforms == 0), "No platform found");

  cl_platform_id* platforms = new cl_platform_id[numPlatforms];
  error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
  // Pick the platform only when the query worked and the index is in range.
  cl_platform_id platform = 0;
  if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
    platform = platforms[_platformIndex];
  }
  // Free the temporary list before the checks so that it is not leaked
  // should CHECK_RESULT leave the function on failure.
  delete[] platforms;
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
  CHECK_RESULT((platform == 0), "AMD Platform not found");

  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &deviceCount_);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
  devices_ = new cl_device_id[deviceCount_];
  error_ =
      _wrapper->clGetDeviceIDs(platform, type_, deviceCount_, devices_, NULL);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");

  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
                                    (cl_context_properties)platform, 0};
  context_ = _wrapper->clCreateContext(props, deviceCount_, devices_, NULL, 0,
                                       &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext failed");

  cl_command_queue cmdQueue;
  for (unsigned int i = 0; i < deviceCount_; ++i) {
#ifndef CL_VERSION_2_0
    cmdQueue = _wrapper->clCreateCommandQueue(
        context_, devices_[i], CL_QUEUE_PROFILING_ENABLE, &error_);
#else
    cl_queue_properties prop[] = {CL_QUEUE_PROPERTIES,
                                  CL_QUEUE_PROFILING_ENABLE, 0};
    cmdQueue = _wrapper->clCreateCommandQueueWithProperties(
        context_, devices_[i], prop, &error_);
#endif
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
    cmdQueues_.push_back(cmdQueue);
  }
  platform_ = platform;
}
/// Releases every OpenCL object held by the test (buffers, kernel, program,
/// command queues, context) and the device list, then returns the result of
/// BaseTestImp::close().
///
/// Each handle is reset to 0 once released, so calling close() a second time
/// without an intervening open() does not release anything twice.
unsigned int OCLTestImp::close() {
  for (unsigned int i = 0; i < buffers().size(); ++i) {
    error_ = _wrapper->clReleaseMemObject(buffers()[i]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseMemObject() failed");
  }
  buffers_.clear();
  if (kernel_ != 0) {
    error_ = _wrapper->clReleaseKernel(kernel_);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseKernel() failed");
    kernel_ = 0;
  }
  if (program_ != 0) {
    error_ = _wrapper->clReleaseProgram(program_);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseProgram() failed");
    program_ = 0;
  }
  for (unsigned int i = 0; i < cmdQueues_.size(); ++i) {
    if (cmdQueues_[i] == 0) {
      continue;  // slot holds no queue, nothing to release
    }
    error_ = _wrapper->clReleaseCommandQueue(cmdQueues_[i]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseCommandQueue() failed");
  }
  cmdQueues_.clear();
  if (context_) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "clReleaseContext() failed");
    context_ = 0;
  }
  if (devices_) {
    delete[] devices_;
    devices_ = 0;
  }
  return BaseTestImp::close();
}
/// Returns n pseudo-random bits from the linear congruential generator kept
/// in _seed.
///
/// @param n  Number of bits, 1..32. Any other value trips an assert and
///           yields 0.
/// @return The random bits in the low n bits of the result.
int OCLTestImp::genBitRand(int n) {
  int rslt;
  if (n <= 0 || n > 32) {
    assert(0);
    rslt = 0;
  } else if (n < 32) {
    _seed = _seed * 1103515245 + 12345;
    /*
     * return the most-significant n bits; they are the random ones (see
     * Knuth, Vol 2)
     */
    rslt = (_seed & 0x7fffffff) >> (31 - n);
  } else {
    // Draw the two halves in separate statements: both calls advance _seed,
    // and the evaluation order of the operands of '|' is unspecified, so a
    // single expression could give different results per compiler.
    const int high = genBitRand(16);
    const int low = genBitRand(16);
    rslt = (int)(((unsigned int)high << 16) | (unsigned int)low);
  }
  return rslt;
}
/// Returns a pseudo-random integer between a and b (inclusive, in either
/// order). Candidates are drawn with genBitRand() using just enough bits to
/// cover the distance between the bounds and are redrawn until they do not
/// exceed that distance.
int OCLTestImp::genIntRand(int a, int b) {
  const int lower = (a > b) ? b : a;
  int span = (a > b) ? (a - b) : (b - a);
  if (span == 0) {
    return a;
  }

  // A negative span can only come from the subtraction wrapping around.
  int direction = 1;
  if (span < 0) {
    direction = -1;
    span = -span;
  }
  span &= 0x7fffffff;

  // Number of significant bits in span.
  int width = 0;
  for (int rest = span; rest > 0; rest >>= 1) {
    ++width;
  }

  int draw;
  do {
    draw = genBitRand(width);
  } while (draw > span);

  return lower + draw * direction;
}
// Stores the wrapper through which this class issues its OpenCL calls. The
// pointer is kept as given; nothing in this file releases it.
void OCLTestImp::setOCLWrapper(OCLWrapper* wrapper) { _wrapper = wrapper; }
/////////////////////////////////////////////////////////////////////////////
#ifdef ATI_OS_WIN
#include <windows.h>
static int initializeSeed(void) {
__int64 val;
QueryPerformanceCounter((LARGE_INTEGER*)&val);
return (int)val;
}
#endif // ATI_OS_WIN
/////////////////////////////////////////////////////////////////////////////
#ifdef ATI_OS_LINUX
#include <sys/time.h>
// Derives a seed from the microsecond part of the current time of day.
static int initializeSeed(void) {
  struct timeval now;
  gettimeofday(&now, 0);
  const int microseconds = (int)now.tv_usec;
  return microseconds;
}
#endif // ATI_OS_LINUX
/////////////////////////////////////////////////////////////////////////////
//
// Same CRC32 as used by ogtst
//
// Value XORed into the register whenever a set top bit is shifted out.
static const unsigned int CRCMASK = 0x04c11db7;

// Advances the 32-bit register `crc` by eight shift steps, most significant
// bit first, and returns the resulting register value. The constructor uses
// it to fill one table entry per byte value.
static unsigned int crcinit(unsigned int crc) {
  unsigned int reg = crc;
  for (int step = 8; step > 0; --step) {
    const bool topBitSet = (reg & 0x80000000) != 0;
    reg <<= 1;
    if (topBitSet) {
      reg ^= CRCMASK;
    }
  }
  return reg;
}
@@ -0,0 +1,70 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLTestListImp.h"
#include <stdlib.h>
#include "OCLTest.h"
//
// OCLTestList_TestCount - retrieve the number of tests in the testing module.
// Returns TestListCount; the other entry points use it as the exclusive upper
// bound for valid test indices.
//
unsigned int OCL_CALLCONV OCLTestList_TestCount(void) { return TestListCount; }
//
// OCLTestList_TestLibVersion - retrieve the version of the test library in
// the testing module. Returns TestLibVersion unchanged.
//
unsigned int OCL_CALLCONV OCLTestList_TestLibVersion(void) {
return TestLibVersion;
}
//
// OCLTestList_TestLibName - retrieve the name of the test library.
// Returns TestLibName unchanged.
//
const char* OCL_CALLCONV OCLTestList_TestLibName(void) { return TestLibName; }
//
// OCLTestList_TestName - retrieve the name of the indexed test in the module
//
// Looks up the name of test number testNum in TestList.
// Returns NULL when testNum is not below OCLTestList_TestCount().
const char* OCL_CALLCONV OCLTestList_TestName(unsigned int testNum) {
  const bool inRange = testNum < OCLTestList_TestCount();
  if (inRange) {
    return TestList[testNum].name;
  }
  return NULL;
}
//
// OCLTestList_CreateTest - create a test by index
//
// Invokes the factory registered for test number testNum and returns the
// created object reinterpreted as an OCLTest pointer.
// Returns NULL when testNum is not below OCLTestList_TestCount().
OCLTest* OCL_CALLCONV OCLTestList_CreateTest(unsigned int testNum) {
  const bool inRange = testNum < OCLTestList_TestCount();
  if (inRange) {
    return reinterpret_cast<OCLTest*>((*TestList[testNum].create)());
  }
  return NULL;
}
//
// OCLTestList_DestroyTest - destroy a test object.
// The object is deleted through its OCLTest pointer.
// NOTE(review): this relies on OCLTest having a virtual destructor - confirm
// against the OCLTest declaration.
//
void OCL_CALLCONV OCLTestList_DestroyTest(OCLTest* test) { delete test; }
@@ -0,0 +1,46 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLTestUtils.h"
#include <fstream>
#include <iostream>
// Read the entire contents of a file into a string.
//
// filename - path of the file to read; it is opened in binary mode.
// s        - receives the file's bytes on success (embedded NUL bytes are
//            kept); left unmodified on any failure.
//
// Returns true when the whole file was read. On failure an error message is
// written to std::cerr and false is returned.
bool loadFile(const char* filename, std::string& s) {
  std::ifstream f(filename, std::ios::in | std::ios::binary);
  if (!f.is_open()) {
    std::cerr << "Error: failed to open file: " << filename << '\n';
    return false;
  }

  // Determine the file length by seeking to the end. tellg() reports -1 on
  // failure, which must not be turned into a buffer size.
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0, std::ios::beg);
  if (length < 0 || !f) {
    std::cerr << "Error: failed to determine size of file: " << filename
              << '\n';
    return false;
  }

  // Read into a local string so that 's' is only touched once the read has
  // fully succeeded, and so the buffer is released on every exit path.
  std::string contents(static_cast<std::string::size_type>(length), '\0');
  if (length > 0) {
    f.read(&contents[0], static_cast<std::streamsize>(length));
  }
  if (!f) {
    std::cerr << "Error: failed to read file: " << filename << '\n';
    return false;
  }

  s.swap(contents);
  return true;
}
@@ -0,0 +1,209 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
//!
//! \file OCLThread.cpp
//!
#include <stdio.h>
#include <stdlib.h>
#include "OCL/Thread.h"
#ifdef ATI_OS_WIN
#include <process.h>
#endif
//! Bundle handed to the platform thread entry point: the callback to run
//! together with an opaque pointer that travels with it.
struct __argsToThreadFunc {
  oclThreadFunc func;  //!< callback to invoke on the new thread
  void *data;          //!< opaque pointer stored alongside the callback
};
typedef struct __argsToThreadFunc argsToThreadFunc;
#ifdef ATI_OS_WIN
//! Windows thread entry point - invokes the callback that the application
//! passed to OCLutil::Thread::create().
//!
//! \param args heap-allocated argsToThreadFunc; its data member is read as an
//!             OCLutil::Thread*. Ownership passes to this function, which
//!             frees the struct after the callback returns.
//! \return always 0
unsigned _stdcall win32ThreadFunc(void *args) {
  argsToThreadFunc *ptr = static_cast<argsToThreadFunc *>(args);
  OCLutil::Thread *obj = static_cast<OCLutil::Thread *>(ptr->data);
  ptr->func(obj->getData());
  // Delete through the typed pointer: deleting a void* is undefined.
  delete ptr;
  return 0;
}
#endif
////////////////////////////////////////////////////////////////////
//!
//! Constructor for OCLLock: initializes the underlying platform
//! primitive (critical section on Windows, pthread mutex otherwise)
//!
OCLutil::Lock::Lock() {
#ifndef ATI_OS_WIN
  pthread_mutex_init(&_lock, NULL);
#else
  InitializeCriticalSection(&_cs);
#endif
}
////////////////////////////////////////////////////////////////////
//!
//! Destructor for OCLLock: tears down the underlying platform
//! primitive
//!
OCLutil::Lock::~Lock() {
#ifndef ATI_OS_WIN
  pthread_mutex_destroy(&_lock);
#else
  DeleteCriticalSection(&_cs);
#endif
}
//////////////////////////////////////////////////////////////
//!
//! Acquire the lock, blocking until it becomes available, then
//! enter the protected area
//!
void OCLutil::Lock::lock() {
#ifndef ATI_OS_WIN
  pthread_mutex_lock(&_lock);
#else
  EnterCriticalSection(&_cs);
#endif
}
//////////////////////////////////////////////////////////////
//!
//! Attempt to acquire the lock without blocking. Returns true
//! when the lock was taken (the caller is then inside the
//! critical section) and false when it was unavailable.
//!
bool OCLutil::Lock::tryLock() {
#ifndef ATI_OS_WIN
  // pthread_mutex_trylock() yields 0 on success, non-zero otherwise.
  const int rc = pthread_mutex_trylock(&_lock);
  return rc == 0;
#else
  const bool entered = (TryEnterCriticalSection(&_cs) != 0);
  return entered;
#endif
}
//////////////////////////////////////////////////////////////
//!
//! Release the lock
//!
void OCLutil::Lock::unlock() {
#ifndef ATI_OS_WIN
  pthread_mutex_unlock(&_lock);
#else
  LeaveCriticalSection(&_cs);
#endif
}
////////////////////////////////////////////////////////////////////
//!
//! Constructor for OCLThread: starts with a zero thread handle and
//! no user data
//!
OCLutil::Thread::Thread() : _tid(0), _data(0) {
#if defined(ATI_OS_WIN)
  _ID = 0;
#endif
}
////////////////////////////////////////////////////////////////////
//!
//! Destructor for OCLThread
//!
//! On Windows the thread handle is closed, provided one was ever
//! obtained. On other platforms nothing is released here.
//!
OCLutil::Thread::~Thread() {
#ifdef ATI_OS_WIN
  // The constructor sets _tid to 0 and only create() replaces it, so a zero
  // value means there is no handle to close.
  if (_tid != 0) {
    CloseHandle(_tid);
  }
#endif
}
//////////////////////////////////////////////////////////////
//!
//! Create a new thread and return the status of the operation
//!
//! \param func callback to run on the new thread
//! \param arg  opaque pointer stored in this object and handed to func
//! \return true when the thread was started, false otherwise
//!
//! When the VERBOSE environment variable is set, diagnostic output is
//! printed to stdout.
//!
bool OCLutil::Thread::create(oclThreadFunc func, void *arg) {
  // Save the data internally
  _data = arg;
  const bool verbose = getenv("VERBOSE") != NULL;
#ifdef ATI_OS_WIN
  // Setup the callback struct for thread function and pass to the
  // begin thread routine. win32ThreadFunc takes ownership of the struct
  // and frees it once the callback has returned.
  argsToThreadFunc *args = new argsToThreadFunc;
  args->func = func;
  args->data = this;
  unsigned int threadId = 0;
  _tid = (HANDLE)_beginthreadex(NULL, 0, win32ThreadFunc, args, 0, &threadId);
  if (verbose) {
    printf("Thread handle value = %p\n", _tid);
    printf("Done creating thread. Thread id value = %u\n", threadId);
  }
  // threadId is the thread identifier, not a status, so success is judged
  // from the returned handle instead.
  if (_tid == 0) {
    // The thread never started, so win32ThreadFunc will not free args.
    delete args;
    return false;
  }
  return true;
#else
  //! Now create the thread, passing the caller's argument straight through
  const int rc = pthread_create(&_tid, NULL, func, arg);
  if (verbose)
    printf("Done creating thread. Ret value %d, Self = %u\n", rc,
           (unsigned int)pthread_self());
  // pthread_create() returns 0 on success.
  return rc == 0;
#endif
}
//////////////////////////////////////////////////////////////
//!
//! Return an identifier for the calling thread, as reported by
//! GetCurrentThreadId() on Windows or pthread_self() elsewhere
//!
unsigned int OCLutil::Thread::getID() {
#ifndef ATI_OS_WIN
  // Narrow the pthread identifier to unsigned int for the caller
  return (unsigned int)pthread_self();
#else
  return GetCurrentThreadId();
#endif
}
//////////////////////////////////////////////////////////////
//!
//! Block until this thread finishes. Returns true when the
//! platform wait/join call reported 0, false otherwise.
//!
bool OCLutil::Thread::join() {
#ifdef ATI_OS_WIN
  const DWORD rc = WaitForSingleObject(_tid, INFINITE);
  if (rc == WAIT_FAILED) {
    printf("Bad call to function(invalid handle?)\n");
  }
#else
  const int rc = pthread_join(_tid, NULL);
#endif
  return rc == 0;
}
@@ -0,0 +1,944 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLWrapper.h"
// Resolves every extension entry point used by the wrapper through
// clGetExtensionFunctionAddress and stores the result in the matching
// *_ptr member. No check is made here that a lookup succeeded.
OCLWrapper::OCLWrapper() {
  // AMD signal / residency / platform extensions
  clEnqueueWaitSignalAMD_ptr = reinterpret_cast<clEnqueueWaitSignalAMD_fn>(
      clGetExtensionFunctionAddress("clEnqueueWaitSignalAMD"));
  clEnqueueWriteSignalAMD_ptr = reinterpret_cast<clEnqueueWriteSignalAMD_fn>(
      clGetExtensionFunctionAddress("clEnqueueWriteSignalAMD"));
  clEnqueueMakeBuffersResidentAMD_ptr =
      reinterpret_cast<clEnqueueMakeBuffersResidentAMD_fn>(
          clGetExtensionFunctionAddress("clEnqueueMakeBuffersResidentAMD"));
  clUnloadPlatformAMD_ptr = reinterpret_cast<clUnloadPlatformAMD_fn>(
      clGetExtensionFunctionAddress("clUnloadPlatformAMD"));

  // CL-GL function pointers
  clGetGLContextInfoKHR_ptr = reinterpret_cast<clGetGLContextInfoKHR_fn>(
      clGetExtensionFunctionAddress("clGetGLContextInfoKHR"));
  clCreateFromGLBuffer_ptr = reinterpret_cast<clCreateFromGLBuffer_fn>(
      clGetExtensionFunctionAddress("clCreateFromGLBuffer"));
  clCreateFromGLTexture_ptr = reinterpret_cast<clCreateFromGLTexture_fn>(
      clGetExtensionFunctionAddress("clCreateFromGLTexture"));
  clCreateFromGLTexture2D_ptr = reinterpret_cast<clCreateFromGLTexture2D_fn>(
      clGetExtensionFunctionAddress("clCreateFromGLTexture2D"));
  clCreateFromGLRenderbuffer_ptr =
      reinterpret_cast<clCreateFromGLRenderbuffer_fn>(
          clGetExtensionFunctionAddress("clCreateFromGLRenderbuffer"));
  clGetGLObjectInfo_ptr = reinterpret_cast<clGetGLObjectInfo_fn>(
      clGetExtensionFunctionAddress("clGetGLObjectInfo"));
  clGetGLTextureInfo_ptr = reinterpret_cast<clGetGLTextureInfo_fn>(
      clGetExtensionFunctionAddress("clGetGLTextureInfo"));
  clEnqueueAcquireGLObjects_ptr =
      reinterpret_cast<clEnqueueAcquireGLObjects_fn>(
          clGetExtensionFunctionAddress("clEnqueueAcquireGLObjects"));
  clEnqueueReleaseGLObjects_ptr =
      reinterpret_cast<clEnqueueReleaseGLObjects_fn>(
          clGetExtensionFunctionAddress("clEnqueueReleaseGLObjects"));

  // Performance counter function pointers
  clCreatePerfCounterAMD_ptr = reinterpret_cast<clCreatePerfCounterAMD_fn>(
      clGetExtensionFunctionAddress("clCreatePerfCounterAMD"));
  clEnqueueBeginPerfCounterAMD_ptr =
      reinterpret_cast<clEnqueueBeginPerfCounterAMD_fn>(
          clGetExtensionFunctionAddress("clEnqueueBeginPerfCounterAMD"));
  clEnqueueEndPerfCounterAMD_ptr =
      reinterpret_cast<clEnqueueEndPerfCounterAMD_fn>(
          clGetExtensionFunctionAddress("clEnqueueEndPerfCounterAMD"));
  clGetPerfCounterInfoAMD_ptr = reinterpret_cast<clGetPerfCounterInfoAMD_fn>(
      clGetExtensionFunctionAddress("clGetPerfCounterInfoAMD"));
  clReleasePerfCounterAMD_ptr = reinterpret_cast<clReleasePerfCounterAMD_fn>(
      clGetExtensionFunctionAddress("clReleasePerfCounterAMD"));
  clRetainPerfCounterAMD_ptr = reinterpret_cast<clRetainPerfCounterAMD_fn>(
      clGetExtensionFunctionAddress("clRetainPerfCounterAMD"));
  clSetDeviceClockModeAMD_ptr = reinterpret_cast<clSetDeviceClockModeAMD_fn>(
      clGetExtensionFunctionAddress("clSetDeviceClockModeAMD"));
}
// Forwards to the global ::clGetPlatformIDs and returns its status unchanged.
cl_int OCLWrapper::clGetPlatformIDs(cl_uint num_entries,
                                    cl_platform_id *platforms,
                                    cl_uint *num_platforms) {
  const cl_int status =
      ::clGetPlatformIDs(num_entries, platforms, num_platforms);
  return status;
}
// Forwards to the global ::clGetPlatformInfo and returns its status unchanged.
cl_int OCLWrapper::clGetPlatformInfo(cl_platform_id platform,
                                     cl_platform_info param_name,
                                     size_t param_value_size, void *param_value,
                                     size_t *param_value_size_ret) {
  const cl_int status = ::clGetPlatformInfo(
      platform, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Forwards to the global ::clGetDeviceIDs and returns its status unchanged.
cl_int OCLWrapper::clGetDeviceIDs(cl_platform_id platform,
                                  cl_device_type device_type,
                                  cl_uint num_entries, cl_device_id *devices,
                                  cl_uint *num_devices) {
  const cl_int status = ::clGetDeviceIDs(platform, device_type, num_entries,
                                         devices, num_devices);
  return status;
}
// Forwards to the global ::clGetDeviceInfo and returns its status unchanged.
cl_int OCLWrapper::clGetDeviceInfo(cl_device_id device,
                                   cl_device_info param_name,
                                   size_t param_value_size, void *param_value,
                                   size_t *param_value_size_ret) {
  const cl_int status = ::clGetDeviceInfo(
      device, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Forwards to the global ::clCreateContext and returns the context it yields.
cl_context OCLWrapper::clCreateContext(
    cl_context_properties *properties, cl_uint num_devices,
    const cl_device_id *devices,
    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
    void *user_data, cl_int *errcode_ret) {
  const cl_context created = ::clCreateContext(
      properties, num_devices, devices, pfn_notify, user_data, errcode_ret);
  return created;
}
// Forwards to the global ::clCreateContextFromType and returns the context it
// yields.
cl_context OCLWrapper::clCreateContextFromType(
    cl_context_properties *properties, cl_device_type device_type,
    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
    void *user_data, cl_int *errcode_ret) {
  const cl_context created = ::clCreateContextFromType(
      properties, device_type, pfn_notify, user_data, errcode_ret);
  return created;
}
// Forwards to the global ::clRetainContext and returns its status unchanged.
cl_int OCLWrapper::clRetainContext(cl_context context) {
  const cl_int status = ::clRetainContext(context);
  return status;
}
// Forwards to the global ::clReleaseContext and returns its status unchanged.
cl_int OCLWrapper::clReleaseContext(cl_context context) {
  const cl_int status = ::clReleaseContext(context);
  return status;
}
// Forwards to the global ::clGetContextInfo and returns its status unchanged.
cl_int OCLWrapper::clGetContextInfo(cl_context context,
                                    cl_context_info param_name,
                                    size_t param_value_size, void *param_value,
                                    size_t *param_value_size_ret) {
  const cl_int status = ::clGetContextInfo(
      context, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Creates a command queue, choosing the entry point that suits the platform.
//
// When compiled against headers that define CL_VERSION_2_0, the platform that
// owns 'device' is asked for its CL_PLATFORM_VERSION string. If that string
// reports major version 1, the legacy ::clCreateCommandQueue is used;
// otherwise (including when any of the queries fail) the queue is created
// with ::clCreateCommandQueueWithProperties. Without CL_VERSION_2_0 the
// legacy entry point is always used.
//
// context, device - forwarded unchanged to the selected entry point
// properties      - queue property bitfield; when non-zero on the 2.0 path it
//                   is passed as a CL_QUEUE_PROPERTIES list entry
// errcode_ret     - forwarded unchanged
//
// Returns whatever the selected entry point returns.
cl_command_queue OCLWrapper::clCreateCommandQueue(
    cl_context context, cl_device_id device,
    cl_command_queue_properties properties, cl_int *errcode_ret) {
#if defined(CL_VERSION_2_0)
  // CL_PLATFORM_VERSION has the form "OpenCL <major>.<minor> <info>", so the
  // major version digit sits at index 7, right after the "OpenCL " prefix.
  const size_t kMajorDigitIndex = 7;
  bool version20 = true;
  cl_platform_id pid;
  cl_int err = ::clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
                                 sizeof(cl_platform_id), &pid, NULL);
  if (err == CL_SUCCESS) {
    size_t size = 0;
    err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, 0, NULL, &size);
    // Only inspect the string when it is long enough to hold the major digit.
    if (err == CL_SUCCESS && size > kMajorDigitIndex) {
      char *ver = new char[size];
      err = ::clGetPlatformInfo(pid, CL_PLATFORM_VERSION, size, ver, NULL);
      if (err == CL_SUCCESS && ver[kMajorDigitIndex] == '1') {
        version20 = false;
      }
      delete[] ver;
    }
  }
  if (version20) {
    const cl_queue_properties cprops[] = {
        CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0};
    // A zero property mask is expressed as "no property list".
    return ::clCreateCommandQueueWithProperties(
        context, device, properties ? cprops : NULL, errcode_ret);
  } else {
    return ::clCreateCommandQueue(context, device, properties, errcode_ret);
  }
#else
  return ::clCreateCommandQueue(context, device, properties, errcode_ret);
#endif
}
// Forwards to the global ::clRetainCommandQueue and returns its status.
cl_int OCLWrapper::clRetainCommandQueue(cl_command_queue command_queue) {
  const cl_int status = ::clRetainCommandQueue(command_queue);
  return status;
}
// Forwards to the global ::clReleaseCommandQueue and returns its status.
cl_int OCLWrapper::clReleaseCommandQueue(cl_command_queue command_queue) {
  const cl_int status = ::clReleaseCommandQueue(command_queue);
  return status;
}
// Forwards to the global ::clGetCommandQueueInfo and returns its status.
cl_int OCLWrapper::clGetCommandQueueInfo(cl_command_queue command_queue,
                                         cl_command_queue_info param_name,
                                         size_t param_value_size,
                                         void *param_value,
                                         size_t *param_value_size_ret) {
  const cl_int status = ::clGetCommandQueueInfo(
      command_queue, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Forwards to the global ::clCreateBuffer and returns the memory object it
// yields.
cl_mem OCLWrapper::clCreateBuffer(cl_context context, cl_mem_flags flags,
                                  size_t size, void *host_ptr,
                                  cl_int *errcode_ret) {
  const cl_mem created =
      ::clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
  return created;
}
// Forwards to the global ::clCreateImage2D and returns the memory object it
// yields.
cl_mem OCLWrapper::clCreateImage2D(cl_context context, cl_mem_flags flags,
                                   const cl_image_format *image_format,
                                   size_t image_width, size_t image_height,
                                   size_t image_row_pitch, void *host_ptr,
                                   cl_int *errcode_ret) {
  const cl_mem created = ::clCreateImage2D(
      context, flags, image_format, image_width, image_height,
      image_row_pitch, host_ptr, errcode_ret);
  return created;
}
// Forwards to the global ::clCreateImage3D and returns the memory object it
// yields.
cl_mem OCLWrapper::clCreateImage3D(cl_context context, cl_mem_flags flags,
                                   const cl_image_format *image_format,
                                   size_t image_width, size_t image_height,
                                   size_t image_depth, size_t image_row_pitch,
                                   size_t image_slice_pitch, void *host_ptr,
                                   cl_int *errcode_ret) {
  const cl_mem created = ::clCreateImage3D(
      context, flags, image_format, image_width, image_height, image_depth,
      image_row_pitch, image_slice_pitch, host_ptr, errcode_ret);
  return created;
}
// Forwards to the global ::clRetainMemObject and returns its status unchanged.
cl_int OCLWrapper::clRetainMemObject(cl_mem memobj) {
  const cl_int status = ::clRetainMemObject(memobj);
  return status;
}
// Forwards to the global ::clReleaseMemObject and returns its status unchanged.
cl_int OCLWrapper::clReleaseMemObject(cl_mem memobj) {
  const cl_int status = ::clReleaseMemObject(memobj);
  return status;
}
// Forwards to the global ::clGetSupportedImageFormats and returns its status.
cl_int OCLWrapper::clGetSupportedImageFormats(cl_context context,
                                              cl_mem_flags flags,
                                              cl_mem_object_type image_type,
                                              cl_uint num_entries,
                                              cl_image_format *image_formats,
                                              cl_uint *num_image_formats) {
  const cl_int status = ::clGetSupportedImageFormats(
      context, flags, image_type, num_entries, image_formats,
      num_image_formats);
  return status;
}
// Forwards to the global ::clGetMemObjectInfo and returns its status unchanged.
cl_int OCLWrapper::clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name,
                                      size_t param_value_size,
                                      void *param_value,
                                      size_t *param_value_size_ret) {
  const cl_int status = ::clGetMemObjectInfo(
      memobj, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Forwards to the global ::clGetImageInfo and returns its status unchanged.
cl_int OCLWrapper::clGetImageInfo(cl_mem image, cl_image_info param_name,
                                  size_t param_value_size, void *param_value,
                                  size_t *param_value_size_ret) {
  const cl_int status = ::clGetImageInfo(
      image, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Creates a sampler. When built against headers that define CL_VERSION_2_0
// the three settings are packed into a zero-terminated property list and
// handed to ::clCreateSamplerWithProperties; otherwise the call is forwarded
// to ::clCreateSampler.
cl_sampler OCLWrapper::clCreateSampler(cl_context context,
                                       cl_bool normalized_coords,
                                       cl_addressing_mode addressing_mode,
                                       cl_filter_mode filter_mode,
                                       cl_int *errcode_ret) {
#if defined(CL_VERSION_2_0)
  const cl_sampler_properties samplerProps[] = {
      CL_SAMPLER_NORMALIZED_COORDS,
      static_cast<cl_sampler_properties>(normalized_coords),
      CL_SAMPLER_ADDRESSING_MODE,
      static_cast<cl_sampler_properties>(addressing_mode),
      CL_SAMPLER_FILTER_MODE,
      static_cast<cl_sampler_properties>(filter_mode),
      0};
  const cl_sampler created =
      ::clCreateSamplerWithProperties(context, samplerProps, errcode_ret);
  return created;
#else
  const cl_sampler created = ::clCreateSampler(
      context, normalized_coords, addressing_mode, filter_mode, errcode_ret);
  return created;
#endif
}
// Forwards to the global ::clRetainSampler and returns its status unchanged.
cl_int OCLWrapper::clRetainSampler(cl_sampler sampler) {
  const cl_int status = ::clRetainSampler(sampler);
  return status;
}
// Forwards to the global ::clReleaseSampler and returns its status unchanged.
cl_int OCLWrapper::clReleaseSampler(cl_sampler sampler) {
  const cl_int status = ::clReleaseSampler(sampler);
  return status;
}
// Forwards to the global ::clGetSamplerInfo and returns its status unchanged.
cl_int OCLWrapper::clGetSamplerInfo(cl_sampler sampler,
                                    cl_sampler_info param_name,
                                    size_t param_value_size, void *param_value,
                                    size_t *param_value_size_ret) {
  const cl_int status = ::clGetSamplerInfo(
      sampler, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Forwards to the global ::clCreateProgramWithSource and returns the program
// it yields.
cl_program OCLWrapper::clCreateProgramWithSource(cl_context context,
                                                 cl_uint count,
                                                 const char **strings,
                                                 const size_t *lengths,
                                                 cl_int *errcode_ret) {
  const cl_program created = ::clCreateProgramWithSource(
      context, count, strings, lengths, errcode_ret);
  return created;
}
// Forwards to the global ::clCreateProgramWithBinary and returns the program
// it yields.
cl_program OCLWrapper::clCreateProgramWithBinary(
    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
    const size_t *lengths, const unsigned char **binaries,
    cl_int *binary_status, cl_int *errcode_ret) {
  const cl_program created = ::clCreateProgramWithBinary(
      context, num_devices, device_list, lengths, binaries, binary_status,
      errcode_ret);
  return created;
}
// Forwards to the global ::clRetainProgram and returns its status unchanged.
cl_int OCLWrapper::clRetainProgram(cl_program program) {
  const cl_int status = ::clRetainProgram(program);
  return status;
}
// Forwards to the global ::clReleaseProgram and returns its status unchanged.
cl_int OCLWrapper::clReleaseProgram(cl_program program) {
  const cl_int status = ::clReleaseProgram(program);
  return status;
}
// Forwards to the global ::clBuildProgram and returns its status unchanged.
cl_int OCLWrapper::clBuildProgram(
    cl_program program, cl_uint num_devices, const cl_device_id *device_list,
    const char *options,
    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
    void *user_data) {
  const cl_int status = ::clBuildProgram(program, num_devices, device_list,
                                         options, pfn_notify, user_data);
  return status;
}
// Forwards to the global ::clCompileProgram and returns its status unchanged.
cl_int OCLWrapper::clCompileProgram(
    cl_program program, cl_uint num_devices, const cl_device_id *device_list,
    const char *options, cl_uint num_input_headers,
    const cl_program *input_headers, const char **header_include_names,
    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
    void *user_data) {
  const cl_int status = ::clCompileProgram(
      program, num_devices, device_list, options, num_input_headers,
      input_headers, header_include_names, pfn_notify, user_data);
  return status;
}
// Forwards to the global ::clLinkProgram and returns the program it yields.
cl_program OCLWrapper::clLinkProgram(
    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
    const char *options, cl_uint num_input_programs,
    const cl_program *input_programs,
    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
    void *user_data, cl_int *errcode_ret) {
  const cl_program linked = ::clLinkProgram(
      context, num_devices, device_list, options, num_input_programs,
      input_programs, pfn_notify, user_data, errcode_ret);
  return linked;
}
// Forwards to the global ::clUnloadCompiler and returns its status unchanged.
cl_int OCLWrapper::clUnloadCompiler(void) {
  const cl_int status = ::clUnloadCompiler();
  return status;
}
// Forwards to the global ::clGetProgramInfo and returns its status unchanged.
cl_int OCLWrapper::clGetProgramInfo(cl_program program,
                                    cl_program_info param_name,
                                    size_t param_value_size, void *param_value,
                                    size_t *param_value_size_ret) {
  const cl_int status = ::clGetProgramInfo(
      program, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Forwards to the global ::clGetProgramBuildInfo and returns its status.
cl_int OCLWrapper::clGetProgramBuildInfo(
    cl_program program, cl_device_id device, cl_program_build_info param_name,
    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
  const cl_int status = ::clGetProgramBuildInfo(
      program, device, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Forwards to the global ::clCreateKernel and returns the kernel it yields.
cl_kernel OCLWrapper::clCreateKernel(cl_program program,
                                     const char *kernel_name,
                                     cl_int *errcode_ret) {
  const cl_kernel created =
      ::clCreateKernel(program, kernel_name, errcode_ret);
  return created;
}
// Forwards to the global ::clCreateKernelsInProgram and returns its status.
cl_int OCLWrapper::clCreateKernelsInProgram(cl_program program,
                                            cl_uint num_kernels,
                                            cl_kernel *kernels,
                                            cl_uint *num_kernels_ret) {
  const cl_int status = ::clCreateKernelsInProgram(program, num_kernels,
                                                   kernels, num_kernels_ret);
  return status;
}
// Forwards to the global ::clRetainKernel and returns its status unchanged.
cl_int OCLWrapper::clRetainKernel(cl_kernel kernel) {
  const cl_int status = ::clRetainKernel(kernel);
  return status;
}
// Forwards to the global ::clReleaseKernel and returns its status unchanged.
cl_int OCLWrapper::clReleaseKernel(cl_kernel kernel) {
  const cl_int status = ::clReleaseKernel(kernel);
  return status;
}
// Forwards to the global ::clSetKernelArg and returns its status unchanged.
cl_int OCLWrapper::clSetKernelArg(cl_kernel kernel, cl_uint arg_index,
                                  size_t arg_size, const void *arg_value) {
  const cl_int status =
      ::clSetKernelArg(kernel, arg_index, arg_size, arg_value);
  return status;
}
// Forwards to the global ::clGetKernelInfo and returns its status unchanged.
cl_int OCLWrapper::clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name,
                                   size_t param_value_size, void *param_value,
                                   size_t *param_value_size_ret) {
  const cl_int status = ::clGetKernelInfo(
      kernel, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Forwards to the global ::clGetKernelWorkGroupInfo and returns its status.
cl_int OCLWrapper::clGetKernelWorkGroupInfo(
    cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
  const cl_int status = ::clGetKernelWorkGroupInfo(
      kernel, device, param_name, param_value_size, param_value,
      param_value_size_ret);
  return status;
}
// Forwards to the global ::clWaitForEvents and returns its status unchanged.
cl_int OCLWrapper::clWaitForEvents(cl_uint num_events,
                                   const cl_event *event_list) {
  const cl_int status = ::clWaitForEvents(num_events, event_list);
  return status;
}
// Forwards to the global ::clGetEventInfo and returns its status unchanged.
cl_int OCLWrapper::clGetEventInfo(cl_event evnt, cl_event_info param_name,
                                  size_t param_value_size, void *param_value,
                                  size_t *param_value_size_ret) {
  const cl_int status = ::clGetEventInfo(
      evnt, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Forwards to the global ::clRetainEvent and returns its status unchanged.
cl_int OCLWrapper::clRetainEvent(cl_event evnt) {
  const cl_int status = ::clRetainEvent(evnt);
  return status;
}
// Forwards to the global ::clReleaseEvent and returns its status unchanged.
cl_int OCLWrapper::clReleaseEvent(cl_event evnt) {
  const cl_int status = ::clReleaseEvent(evnt);
  return status;
}
// Forwards to the global ::clGetEventProfilingInfo and returns its status.
cl_int OCLWrapper::clGetEventProfilingInfo(cl_event evnt,
                                           cl_profiling_info param_name,
                                           size_t param_value_size,
                                           void *param_value,
                                           size_t *param_value_size_ret) {
  const cl_int status = ::clGetEventProfilingInfo(
      evnt, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Forwards to the global ::clFlush and returns its status unchanged.
cl_int OCLWrapper::clFlush(cl_command_queue command_queue) {
  const cl_int status = ::clFlush(command_queue);
  return status;
}
// Forwards to the global ::clFinish and returns its status unchanged.
cl_int OCLWrapper::clFinish(cl_command_queue command_queue) {
  const cl_int status = ::clFinish(command_queue);
  return status;
}
// Forwards to the global ::clEnqueueReadBuffer and returns its status.
cl_int OCLWrapper::clEnqueueReadBuffer(cl_command_queue command_queue,
                                       cl_mem buffer, cl_bool blocking_read,
                                       size_t offset, size_t cb, void *ptr,
                                       cl_uint num_events_in_wait_list,
                                       const cl_event *event_wait_list,
                                       cl_event *evnt) {
  const cl_int status = ::clEnqueueReadBuffer(
      command_queue, buffer, blocking_read, offset, cb, ptr,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueWriteBuffer and returns its status.
cl_int OCLWrapper::clEnqueueWriteBuffer(
    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
    size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueWriteBuffer(
      command_queue, buffer, blocking_write, offset, cb, ptr,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueCopyBuffer and returns its status.
cl_int OCLWrapper::clEnqueueCopyBuffer(cl_command_queue command_queue,
                                       cl_mem src_buffer, cl_mem dst_buffer,
                                       size_t src_offset, size_t dst_offset,
                                       size_t cb,
                                       cl_uint num_events_in_wait_list,
                                       const cl_event *event_wait_list,
                                       cl_event *evnt) {
  const cl_int status = ::clEnqueueCopyBuffer(
      command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueReadBufferRect and returns its status.
cl_int OCLWrapper::clEnqueueReadBufferRect(
    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
    const size_t *buffer_origin, const size_t *host_origin,
    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
    size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *evnt) {
  const cl_int status = ::clEnqueueReadBufferRect(
      command_queue, buffer, blocking_read, buffer_origin, host_origin, region,
      buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
      ptr, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueWriteBufferRect and returns its status.
cl_int OCLWrapper::clEnqueueWriteBufferRect(
    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
    const size_t *buffer_origin, const size_t *host_origin,
    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
    size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *evnt) {
  const cl_int status = ::clEnqueueWriteBufferRect(
      command_queue, buffer, blocking_write, buffer_origin, host_origin,
      region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
      host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueCopyBufferRect and returns its status.
cl_int OCLWrapper::clEnqueueCopyBufferRect(
    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
    const size_t *src_origin, const size_t *dst_origin, const size_t *region,
    size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
    size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueCopyBufferRect(
      command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
      src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueReadImage and returns its status.
cl_int OCLWrapper::clEnqueueReadImage(
    cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
    const size_t *origin, const size_t *region, size_t row_pitch,
    size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueReadImage(
      command_queue, image, blocking_read, origin, region, row_pitch,
      slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueWriteImage and returns its status.
cl_int OCLWrapper::clEnqueueWriteImage(
    cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
    const size_t *origin, const size_t *region, size_t input_row_pitch,
    size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueWriteImage(
      command_queue, image, blocking_write, origin, region, input_row_pitch,
      input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueCopyImage and returns its status.
cl_int OCLWrapper::clEnqueueCopyImage(
    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
    const size_t *src_origin, const size_t *dst_origin, const size_t *region,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *evnt) {
  const cl_int status = ::clEnqueueCopyImage(
      command_queue, src_image, dst_image, src_origin, dst_origin, region,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueCopyImageToBuffer and returns its status.
cl_int OCLWrapper::clEnqueueCopyImageToBuffer(
    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer,
    const size_t *src_origin, const size_t *region, size_t dst_offset,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *evnt) {
  const cl_int status = ::clEnqueueCopyImageToBuffer(
      command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueCopyBufferToImage and returns its status.
cl_int OCLWrapper::clEnqueueCopyBufferToImage(
    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,
    size_t src_offset, const size_t *dst_origin, const size_t *region,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *evnt) {
  const cl_int status = ::clEnqueueCopyBufferToImage(
      command_queue, src_buffer, dst_image, src_offset, dst_origin, region,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueMapBuffer and returns the pointer it
// yields.
void *OCLWrapper::clEnqueueMapBuffer(cl_command_queue command_queue,
                                     cl_mem buffer, cl_bool blocking_map,
                                     cl_map_flags map_flags, size_t offset,
                                     size_t cb, cl_uint num_events_in_wait_list,
                                     const cl_event *event_wait_list,
                                     cl_event *evnt, cl_int *errcode_ret) {
  void *const mapped = ::clEnqueueMapBuffer(
      command_queue, buffer, blocking_map, map_flags, offset, cb,
      num_events_in_wait_list, event_wait_list, evnt, errcode_ret);
  return mapped;
}
// Forwards to the global ::clEnqueueMapImage and returns the pointer it
// yields.
void *OCLWrapper::clEnqueueMapImage(
    cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
    cl_map_flags map_flags, const size_t *origin, const size_t *region,
    size_t *image_row_pitch, size_t *image_slice_pitch,
    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
    cl_event *evnt, cl_int *errcode_ret) {
  void *const mapped = ::clEnqueueMapImage(
      command_queue, image, blocking_map, map_flags, origin, region,
      image_row_pitch, image_slice_pitch, num_events_in_wait_list,
      event_wait_list, evnt, errcode_ret);
  return mapped;
}
// Forwards to the global ::clEnqueueUnmapMemObject and returns its status.
cl_int OCLWrapper::clEnqueueUnmapMemObject(cl_command_queue command_queue,
                                           cl_mem memobj, void *mapped_ptr,
                                           cl_uint num_events_in_wait_list,
                                           const cl_event *event_wait_list,
                                           cl_event *evnt) {
  const cl_int status = ::clEnqueueUnmapMemObject(
      command_queue, memobj, mapped_ptr, num_events_in_wait_list,
      event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueNDRangeKernel and returns its status.
cl_int OCLWrapper::clEnqueueNDRangeKernel(
    cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
    const size_t *global_work_offset, const size_t *global_work_size,
    const size_t *local_work_size, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueNDRangeKernel(
      command_queue, kernel, work_dim, global_work_offset, global_work_size,
      local_work_size, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Enqueues a single-work-item execution of 'kernel'. When built against
// headers that define CL_VERSION_2_0 this is issued as a one-dimensional
// ::clEnqueueNDRangeKernel with global and local size 1; otherwise the call
// is forwarded to ::clEnqueueTask.
cl_int OCLWrapper::clEnqueueTask(cl_command_queue command_queue,
                                 cl_kernel kernel,
                                 cl_uint num_events_in_wait_list,
                                 const cl_event *event_wait_list,
                                 cl_event *evnt) {
#if defined(CL_VERSION_2_0)
  // Only the first entry of each array is used because work_dim is 1.
  static size_t const singleItemGlobal[3] = {1, 0, 0};
  static size_t const singleItemLocal[3] = {1, 0, 0};
  const cl_int status = ::clEnqueueNDRangeKernel(
      command_queue, kernel, 1, NULL, singleItemGlobal, singleItemLocal,
      num_events_in_wait_list, event_wait_list, evnt);
  return status;
#else
  const cl_int status = ::clEnqueueTask(
      command_queue, kernel, num_events_in_wait_list, event_wait_list, evnt);
  return status;
#endif
}
// Forwards to the global ::clEnqueueNativeKernel and returns its status.
cl_int OCLWrapper::clEnqueueNativeKernel(
    cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),
    void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,
    const void **args_mem_loc, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueNativeKernel(
      command_queue, user_func, args, cb_args, num_mem_objects, mem_list,
      args_mem_loc, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueMarker and returns its status unchanged.
cl_int OCLWrapper::clEnqueueMarker(cl_command_queue command_queue,
                                   cl_event *evnt) {
  const cl_int status = ::clEnqueueMarker(command_queue, evnt);
  return status;
}
// Forwards to the global ::clEnqueueMarkerWithWaitList and returns its status.
cl_int OCLWrapper::clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
                                               cl_uint num_events_in_wait_list,
                                               const cl_event *event_wait_list,
                                               cl_event *evnt) {
  const cl_int status = ::clEnqueueMarkerWithWaitList(
      command_queue, num_events_in_wait_list, event_wait_list, evnt);
  return status;
}
// Forwards to the global ::clEnqueueWaitForEvents and returns its status.
cl_int OCLWrapper::clEnqueueWaitForEvents(cl_command_queue command_queue,
                                          cl_uint num_events,
                                          const cl_event *event_list) {
  const cl_int status =
      ::clEnqueueWaitForEvents(command_queue, num_events, event_list);
  return status;
}
// Forwards to the global ::clEnqueueBarrier and returns its status unchanged.
cl_int OCLWrapper::clEnqueueBarrier(cl_command_queue command_queue) {
  const cl_int status = ::clEnqueueBarrier(command_queue);
  return status;
}
// Forwards to the global ::clGetExtensionFunctionAddress and returns the
// address it yields.
void *OCLWrapper::clGetExtensionFunctionAddress(const char *func_name) {
  void *const address = ::clGetExtensionFunctionAddress(func_name);
  return address;
}
// Forwards to the global ::clCreateImage and returns the memory object it
// yields.
cl_mem OCLWrapper::clCreateImage(cl_context context, cl_mem_flags flags,
                                 const cl_image_format *image_format,
                                 const cl_image_desc *image_desc,
                                 void *host_ptr, cl_int *errcode_ret) {
  const cl_mem created = ::clCreateImage(context, flags, image_format,
                                         image_desc, host_ptr, errcode_ret);
  return created;
}
// Forwards to the global ::clCreateSubBuffer and returns the memory object it
// yields.
cl_mem OCLWrapper::clCreateSubBuffer(cl_mem mem, cl_mem_flags flags,
                                     cl_buffer_create_type buffer_create_type,
                                     const void *buffer_create_info,
                                     cl_int *errcode_ret) {
  const cl_mem created = ::clCreateSubBuffer(
      mem, flags, buffer_create_type, buffer_create_info, errcode_ret);
  return created;
}
// Forwards to the global ::clSetEventCallback and returns its status.
cl_int OCLWrapper::clSetEventCallback(
    cl_event event, cl_int command_exec_callback_type,
    void(CL_CALLBACK *pfn_event_notify)(cl_event event,
                                        cl_int event_command_exec_status,
                                        void *user_data),
    void *user_data) {
  const cl_int status = ::clSetEventCallback(
      event, command_exec_callback_type, pfn_event_notify, user_data);
  return status;
}
// Forwards to the global ::clEnqueueFillImage and returns its status.
cl_int OCLWrapper::clEnqueueFillImage(
    cl_command_queue command_queue, cl_mem image, void *ptr,
    const size_t *origin, const size_t *region, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *evnt) {
  const cl_int status = ::clEnqueueFillImage(
      command_queue, image, ptr, origin, region, num_events_in_wait_list,
      event_wait_list, evnt);
  return status;
}
// Calls the clUnloadPlatformAMD extension entry point when it has been
// resolved; otherwise reports CL_SUCCESS without doing anything.
cl_int OCLWrapper::clUnloadPlatformAMD(cl_platform_id id) {
  if (!clUnloadPlatformAMD_ptr) {
    return CL_SUCCESS;
  }
  return clUnloadPlatformAMD_ptr(id);
}
// Forwards to the clEnqueueWaitSignalAMD extension entry point held in
// clEnqueueWaitSignalAMD_ptr. The pointer is called without a null check.
cl_int OCLWrapper::clEnqueueWaitSignalAMD(cl_command_queue command_queue,
                                          cl_mem mem_object, cl_uint value,
                                          cl_uint num_events,
                                          const cl_event *event_wait_list,
                                          cl_event *event) {
  const cl_int status = clEnqueueWaitSignalAMD_ptr(
      command_queue, mem_object, value, num_events, event_wait_list, event);
  return status;
}
// Forwards to the clEnqueueWriteSignalAMD extension entry point held in
// clEnqueueWriteSignalAMD_ptr. The pointer is called without a null check.
cl_int OCLWrapper::clEnqueueWriteSignalAMD(cl_command_queue command_queue,
                                           cl_mem mem_object, cl_uint value,
                                           cl_ulong offset, cl_uint num_events,
                                           const cl_event *event_list,
                                           cl_event *event) {
  const cl_int status = clEnqueueWriteSignalAMD_ptr(
      command_queue, mem_object, value, offset, num_events, event_list, event);
  return status;
}
// Forwards to the clEnqueueMakeBuffersResidentAMD extension entry point held
// in clEnqueueMakeBuffersResidentAMD_ptr. The pointer is called without a
// null check.
cl_int OCLWrapper::clEnqueueMakeBuffersResidentAMD(
    cl_command_queue command_queue, cl_uint num_mem_objs, cl_mem *mem_objects,
    cl_bool blocking_make_resident, cl_bus_address_amd *bus_addresses,
    cl_uint num_events, const cl_event *event_list, cl_event *event) {
  const cl_int status = clEnqueueMakeBuffersResidentAMD_ptr(
      command_queue, num_mem_objs, mem_objects, blocking_make_resident,
      bus_addresses, num_events, event_list, event);
  return status;
}
// Pass-through to the global ::clEnqueueMigrateMemObjects; arguments and
// result are forwarded unchanged.
cl_int OCLWrapper::clEnqueueMigrateMemObjects(cl_command_queue command_queue,
                                              cl_uint num_mem_objects,
                                              const cl_mem *mem_objects,
                                              cl_mem_migration_flags flags,
                                              cl_uint num_events_in_wait_list,
                                              const cl_event *event_wait_list,
                                              cl_event *event) {
  const cl_int status = ::clEnqueueMigrateMemObjects(
      command_queue, num_mem_objects, mem_objects, flags,
      num_events_in_wait_list, event_wait_list, event);
  return status;
}
// Forwards to the entry point held in clGetGLContextInfoKHR_ptr. The pointer
// is called without a null check.
cl_int OCLWrapper::clGetGLContextInfoKHR(
    const cl_context_properties *properties, cl_gl_context_info param_name,
    size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
  const cl_int status =
      clGetGLContextInfoKHR_ptr(properties, param_name, param_value_size,
                                param_value, param_value_size_ret);
  return status;
}
// Forwards to the entry point held in clCreateFromGLBuffer_ptr. The pointer
// is called without a null check.
cl_mem OCLWrapper::clCreateFromGLBuffer(cl_context context, cl_mem_flags flags,
                                        unsigned int bufobj, int *errcode_ret) {
  cl_mem buffer = clCreateFromGLBuffer_ptr(context, flags, bufobj, errcode_ret);
  return buffer;
}
// Forwards to the entry point held in clCreateFromGLTexture_ptr. The pointer
// is called without a null check.
cl_mem OCLWrapper::clCreateFromGLTexture(cl_context context, cl_mem_flags flags,
                                         unsigned int texture_target,
                                         int miplevel, unsigned int texture,
                                         cl_int *errcode_ret) {
  cl_mem image = clCreateFromGLTexture_ptr(context, flags, texture_target,
                                           miplevel, texture, errcode_ret);
  return image;
}
// Forwards to the entry point held in clCreateFromGLTexture2D_ptr. The
// pointer is called without a null check.
cl_mem OCLWrapper::clCreateFromGLTexture2D(cl_context context,
                                           cl_mem_flags flags,
                                           unsigned int texture_target,
                                           int miplevel, unsigned int texture,
                                           cl_int *errcode_ret) {
  cl_mem image = clCreateFromGLTexture2D_ptr(context, flags, texture_target,
                                             miplevel, texture, errcode_ret);
  return image;
}
// Forwards to the entry point held in clCreateFromGLRenderbuffer_ptr. The
// pointer is called without a null check.
cl_mem OCLWrapper::clCreateFromGLRenderbuffer(cl_context context,
                                              cl_mem_flags flags,
                                              unsigned int renderbuffer,
                                              cl_int *errcode_ret) {
  cl_mem image =
      clCreateFromGLRenderbuffer_ptr(context, flags, renderbuffer, errcode_ret);
  return image;
}
// Forwards to the entry point held in clGetGLObjectInfo_ptr. The pointer is
// called without a null check.
cl_int OCLWrapper::clGetGLObjectInfo(cl_mem memobj,
                                     cl_gl_object_type *gl_object_type,
                                     unsigned int *gl_object_name) {
  const cl_int status =
      clGetGLObjectInfo_ptr(memobj, gl_object_type, gl_object_name);
  return status;
}
// Forwards to the entry point held in clGetGLTextureInfo_ptr. The pointer is
// called without a null check.
cl_int OCLWrapper::clGetGLTextureInfo(cl_mem memobj,
                                      cl_gl_texture_info param_name,
                                      size_t param_value_size,
                                      void *param_value,
                                      size_t *param_value_size_ret) {
  const cl_int status = clGetGLTextureInfo_ptr(
      memobj, param_name, param_value_size, param_value, param_value_size_ret);
  return status;
}
// Forwards to the entry point held in clEnqueueAcquireGLObjects_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clEnqueueAcquireGLObjects(cl_command_queue command_queue,
                                             cl_uint num_objects,
                                             const cl_mem *mem_objects,
                                             cl_uint num_events_in_wait_list,
                                             const cl_event *event_wait_list,
                                             cl_event *event) {
  const cl_int status = clEnqueueAcquireGLObjects_ptr(
      command_queue, num_objects, mem_objects, num_events_in_wait_list,
      event_wait_list, event);
  return status;
}
// Forwards to the entry point held in clEnqueueReleaseGLObjects_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clEnqueueReleaseGLObjects(cl_command_queue command_queue,
                                             cl_uint num_objects,
                                             const cl_mem *mem_objects,
                                             cl_uint num_events_in_wait_list,
                                             const cl_event *event_wait_list,
                                             cl_event *event) {
  const cl_int status = clEnqueueReleaseGLObjects_ptr(
      command_queue, num_objects, mem_objects, num_events_in_wait_list,
      event_wait_list, event);
  return status;
}
#if defined(CL_VERSION_2_0)
// Pass-through to the global ::clCreateCommandQueueWithProperties; arguments
// and the returned queue are forwarded unchanged.
cl_command_queue OCLWrapper::clCreateCommandQueueWithProperties(
    cl_context context, cl_device_id device,
    const cl_queue_properties *properties, cl_int *errcode_ret) {
  cl_command_queue queue = ::clCreateCommandQueueWithProperties(
      context, device, properties, errcode_ret);
  return queue;
}
// Pass-through to the global ::clSVMAlloc; returns the pointer the runtime
// hands back.
void *OCLWrapper::clSVMAlloc(cl_context context, cl_svm_mem_flags flags,
                             size_t size, cl_uint alignment) {
  void *svmPointer = ::clSVMAlloc(context, flags, size, alignment);
  return svmPointer;
}
// Pass-through to the global ::clSVMFree.
void OCLWrapper::clSVMFree(cl_context context, void *svm_pointer) {
  ::clSVMFree(context, svm_pointer);
}
// Pass-through to the global ::clEnqueueSVMMap; arguments and result are
// forwarded unchanged.
cl_int OCLWrapper::clEnqueueSVMMap(cl_command_queue command_queue,
                                   cl_bool blocking_map, cl_map_flags flags,
                                   void *svm_ptr, size_t size,
                                   cl_uint num_events_in_wait_list,
                                   const cl_event *event_wait_list,
                                   cl_event *event) {
  const cl_int status =
      ::clEnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size,
                        num_events_in_wait_list, event_wait_list, event);
  return status;
}
// Pass-through to the global ::clEnqueueSVMUnmap; arguments and result are
// forwarded unchanged.
cl_int OCLWrapper::clEnqueueSVMUnmap(cl_command_queue command_queue,
                                     void *svm_ptr,
                                     cl_uint num_events_in_wait_list,
                                     const cl_event *event_wait_list,
                                     cl_event *event) {
  const cl_int status = ::clEnqueueSVMUnmap(
      command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
  return status;
}
// Pass-through to the global ::clEnqueueSVMMemFill; arguments and result are
// forwarded unchanged.
cl_int OCLWrapper::clEnqueueSVMMemFill(cl_command_queue command_queue,
                                       void *svm_ptr, const void *pattern,
                                       size_t pattern_size, size_t size,
                                       cl_uint num_events_in_wait_list,
                                       const cl_event *event_wait_list,
                                       cl_event *event) {
  const cl_int status = ::clEnqueueSVMMemFill(
      command_queue, svm_ptr, pattern, pattern_size, size,
      num_events_in_wait_list, event_wait_list, event);
  return status;
}
// Pass-through to the global ::clSetKernelArgSVMPointer.
cl_int OCLWrapper::clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index,
                                            const void *arg_value) {
  const cl_int status = ::clSetKernelArgSVMPointer(kernel, arg_index, arg_value);
  return status;
}
// Pass-through to the global ::clCreatePipe; arguments and the returned
// memory object are forwarded unchanged.
cl_mem OCLWrapper::clCreatePipe(cl_context context, cl_mem_flags flags,
                                cl_uint packet_size, cl_uint pipe_max_packets,
                                const cl_pipe_properties *properties,
                                cl_int *errcode_ret) {
  cl_mem pipe = ::clCreatePipe(context, flags, packet_size, pipe_max_packets,
                               properties, errcode_ret);
  return pipe;
}
// Pass-through to the global ::clGetPipeInfo; arguments and result are
// forwarded unchanged.
cl_int OCLWrapper::clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name,
                                 size_t param_value_size, void *param_value,
                                 size_t *param_value_size_ret) {
  const cl_int status = ::clGetPipeInfo(pipe, param_name, param_value_size,
                                        param_value, param_value_size_ret);
  return status;
}
#endif
// Forwards to the entry point held in clCreatePerfCounterAMD_ptr. The pointer
// is called without a null check.
cl_perfcounter_amd OCLWrapper::clCreatePerfCounterAMD(
    cl_device_id device, cl_perfcounter_property *properties,
    cl_int *errcode_ret) {
  cl_perfcounter_amd counter =
      clCreatePerfCounterAMD_ptr(device, properties, errcode_ret);
  return counter;
}
// Forwards to the entry point held in clEnqueueBeginPerfCounterAMD_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clEnqueueBeginPerfCounterAMD(
    cl_command_queue command_queue, cl_uint num_perf_counters,
    cl_perfcounter_amd *perf_counters, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event) {
  const cl_int status = clEnqueueBeginPerfCounterAMD_ptr(
      command_queue, num_perf_counters, perf_counters, num_events_in_wait_list,
      event_wait_list, event);
  return status;
}
// Forwards to the entry point held in clEnqueueEndPerfCounterAMD_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clEnqueueEndPerfCounterAMD(cl_command_queue command_queue,
                                              cl_uint num_perf_counters,
                                              cl_perfcounter_amd *perf_counters,
                                              cl_uint num_events_in_wait_list,
                                              const cl_event *event_wait_list,
                                              cl_event *event) {
  const cl_int status = clEnqueueEndPerfCounterAMD_ptr(
      command_queue, num_perf_counters, perf_counters, num_events_in_wait_list,
      event_wait_list, event);
  return status;
}
// Forwards to the entry point held in clGetPerfCounterInfoAMD_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clGetPerfCounterInfoAMD(cl_perfcounter_amd perf_counter,
                                           cl_perfcounter_info param_name,
                                           size_t param_value_size,
                                           void *param_value,
                                           size_t *param_value_size_ret) {
  const cl_int status =
      clGetPerfCounterInfoAMD_ptr(perf_counter, param_name, param_value_size,
                                  param_value, param_value_size_ret);
  return status;
}
// Forwards to the entry point held in clReleasePerfCounterAMD_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clReleasePerfCounterAMD(cl_perfcounter_amd perf_counter) {
  const cl_int status = clReleasePerfCounterAMD_ptr(perf_counter);
  return status;
}
// Forwards to the entry point held in clRetainPerfCounterAMD_ptr. The pointer
// is called without a null check.
cl_int OCLWrapper::clRetainPerfCounterAMD(cl_perfcounter_amd perf_counter) {
  const cl_int status = clRetainPerfCounterAMD_ptr(perf_counter);
  return status;
}
// Forwards to the entry point held in clSetDeviceClockModeAMD_ptr. The
// pointer is called without a null check.
cl_int OCLWrapper::clSetDeviceClockModeAMD(
    cl_device_id device,
    cl_set_device_clock_mode_input_amd set_clock_mode_input,
    cl_set_device_clock_mode_output_amd *set_clock_mode_output) {
  const cl_int status = clSetDeviceClockModeAMD_ptr(
      device, set_clock_mode_input, set_clock_mode_output);
  return status;
}
@@ -0,0 +1,112 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "Timer.h"
#ifdef ATI_OS_WIN
#include <windows.h>
#endif
#ifdef ATI_OS_LINUX
#include <time.h>
#define NANOSECONDS_PER_SEC 1000000000
#endif
// Starts with no accumulated time and no measurement in progress, then
// records the tick frequency used by GetElapsedTime(): the performance
// counter frequency on Windows, nanoseconds per second on Linux.
CPerfCounter::CPerfCounter() : _clocks(0), _start(0) {
#if defined(ATI_OS_WIN)
  QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&_freq));
#endif
#if defined(ATI_OS_LINUX)
  _freq = NANOSECONDS_PER_SEC;
#endif
}
// Nothing to release: the counter only holds integer members.
CPerfCounter::~CPerfCounter() {}
// Records the current timestamp in _start. On Windows, calling Start() while
// _start is already non-zero shows an error box and exits the process.
void CPerfCounter::Start(void) {
#if defined(ATI_OS_WIN)
  if (_start != 0) {
    MessageBox(NULL, "Bad Perf Counter Start", "Error", MB_OK);
    exit(0);
  }
  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&_start));
#endif
#if defined(ATI_OS_LINUX)
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  // Flatten seconds + nanoseconds into a single nanosecond tick count.
  const i64 seconds = static_cast<i64>(now.tv_sec);
  const i64 nanos = static_cast<i64>(now.tv_nsec);
  _start = seconds * NANOSECONDS_PER_SEC + nanos;
#endif
}
void CPerfCounter::Stop(void) {
i64 n;
#ifdef ATI_OS_WIN
if (!_start) {
MessageBox(NULL, "Bad Perf Counter Stop", "Error", MB_OK);
exit(0);
}
QueryPerformanceCounter((LARGE_INTEGER *)&n);
#endif
#ifdef ATI_OS_LINUX
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
n = (i64)s.tv_sec * NANOSECONDS_PER_SEC + (i64)s.tv_nsec;
#endif
n -= _start;
_start = 0;
_clocks += n;
}
// Clears the accumulated tick count. On Windows, calling Reset() while
// _start is non-zero shows an error box and exits the process.
void CPerfCounter::Reset(void) {
#if defined(ATI_OS_WIN)
  if (_start != 0) {
    MessageBox(NULL, "Bad Perf Counter Reset", "Error", MB_OK);
    exit(0);
  }
#endif
  _clocks = 0;
}
// Returns the accumulated ticks divided by the tick frequency. On Windows,
// querying while _start is non-zero shows an error box and exits the process.
double CPerfCounter::GetElapsedTime(void) {
#if defined(ATI_OS_WIN)
  if (_start != 0) {
    MessageBox(NULL, "Trying to get time while still running.", "Error", MB_OK);
    exit(0);
  }
#endif
  const double ticks = static_cast<double>(_clocks);
  const double ticksPerSecond = static_cast<double>(_freq);
  return ticks / ticksPerSecond;
}
@@ -0,0 +1,46 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _TIMER_H_
#define _TIMER_H_
// i64: integer type used by CPerfCounter for tick counts. It is only defined
// when ATI_OS_WIN or ATI_OS_LINUX is set.
#ifdef ATI_OS_WIN
typedef __int64 i64;
#endif
#ifdef ATI_OS_LINUX
typedef long long i64;
#endif
// Stopwatch that accumulates elapsed ticks over Start()/Stop() pairs and
// reports the total via GetElapsedTime().
class CPerfCounter {
 public:
  CPerfCounter();
  ~CPerfCounter();

  void Start();             // record the starting timestamp
  void Stop();              // add the ticks since Start() to the total
  void Reset();             // clear the accumulated total
  double GetElapsedTime();  // accumulated ticks divided by the frequency

 private:
  i64 _freq;    // tick frequency, set by the constructor
  i64 _clocks;  // ticks accumulated by Stop()
  i64 _start;   // timestamp taken by Start(); Stop() resets it to 0
};
#endif // _TIMER_H_
@@ -0,0 +1,236 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLDX11Common.h"
#define D3D_FEATURE_LEVEL_11_1 0xb100
/* INITPFN(x): resolves the extension entry point named x for platform_ and
 * stores it in the member of the same name (cast to x##_fn). When the lookup
 * returns NULL it sets _errorFlag, stores and prints a diagnostic, bumps
 * _crcword and returns from the enclosing function, so it can only be used
 * inside functions returning void.
 * The message is built directly as a std::string; the previous version
 * formatted it into an unchecked malloc() buffer. */
#define INITPFN(x)                                                           \
  x = (x##_fn)clGetExtensionFunctionAddressForPlatform(platform_, #x);       \
  if ((x) == NULL) {                                                         \
    _errorFlag = true;                                                       \
    _errorMsg = std::string("Failed to get function pointer for ") + #x;     \
    printf("%s:%d - %s\n", __FILE__, __LINE__, _errorMsg.c_str());           \
    _crcword += 1;                                                           \
    return;                                                                  \
  }
// Puts every member into a known empty state. close() tests the D3D handles
// before releasing them, so they must not be left indeterminate when open()
// was never called or returned early.
OCLDX11Common::OCLDX11Common()
    : OCLTestImp(),
      extensionsAvailable(false),
      dxD3D11Device(NULL),
      dxD3D11Context(NULL),
      dxDX11Texture(NULL),
      _queue(NULL),
      clGetDeviceIDsFromD3D11KHR(NULL),
      clCreateFromD3D11BufferKHR(NULL),
      clCreateFromD3D11Texture2DKHR(NULL),
      clCreateFromD3D11Texture3DKHR(NULL),
      clEnqueueAcquireD3D11ObjectsKHR(NULL),
      clEnqueueReleaseD3D11ObjectsKHR(NULL),
      clGetPlaneFromImageAMD(NULL) {}
// Intentionally empty: the resources visible here are released in close().
OCLDX11Common::~OCLDX11Common() {
}
// Checks that the platform exports cl_khr_d3d11_sharing (and that this
// matches the Windows major version), then sets extensionsAvailable according
// to whether the selected device exports cl_amd_planar_yuv.
//
// Extension strings are read into buffers sized by a preliminary size query,
// so a long extension list no longer makes the query fail. The flag starts as
// false so an early CHECK_RESULT return never leaves it indeterminate.
void OCLDX11Common::ExtensionCheck() {
  cl_int result = CL_SUCCESS;
  extensionsAvailable = false;

  // --- platform extensions -------------------------------------------------
  size_t extensionsSize = 0;
  result = _wrapper->clGetPlatformInfo(platform_, CL_PLATFORM_EXTENSIONS, 0,
                                       NULL, &extensionsSize);
  CHECK_RESULT(result != CL_SUCCESS, "Failed to list platform extensions.");
  // One extra zero byte guarantees termination for strstr().
  std::vector<char> extensions(extensionsSize + 1, '\0');
  result = _wrapper->clGetPlatformInfo(platform_, CL_PLATFORM_EXTENSIONS,
                                       extensionsSize, &extensions[0], NULL);
  CHECK_RESULT(result != CL_SUCCESS, "Failed to list platform extensions.");

  extensionsAvailable =
      strstr(&extensions[0], "cl_khr_d3d11_sharing") ? true : false;
  if (!extensionsAvailable) {
    printf("cl_khr_d3d11_sharing extension is required for this test!\n");
  }

  // The sharing extension is expected exactly when the OS major version >= 6.
  OSVERSIONINFOEX versionInfo = {0};
  versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
  versionInfo.dwMajorVersion = 6;
  DWORDLONG conditionMask = 0;
  VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION, conditionMask)) {
    CHECK_RESULT(!extensionsAvailable,
                 "Extension should be exported on Windows >= 6");
  } else {
    CHECK_RESULT(extensionsAvailable,
                 "Extension should not be exported on Windows < 6");
  }

  // --- device extensions ---------------------------------------------------
  extensionsSize = 0;
  result = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
                                     0, NULL, &extensionsSize);
  CHECK_RESULT(result != CL_SUCCESS, "Failed to list device extensions.");
  extensions.assign(extensionsSize + 1, '\0');
  result = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
                                     extensionsSize, &extensions[0], NULL);
  CHECK_RESULT(result != CL_SUCCESS, "Failed to list device extensions.");

  extensionsAvailable =
      strstr(&extensions[0], "cl_amd_planar_yuv") ? true : false;
  if (!extensionsAvailable) {
    printf("cl_amd_planar_yuv extension is required for this test!\n");
  }
}
void OCLDX11Common::open(unsigned int test, char* units, double& conversion,
unsigned int deviceId) {
// OpenCL Initialization
// OCLTestImp::open(test, units, conversion, deviceId);
BaseTestImp::open();
devices_ = 0;
deviceCount_ = 0;
context_ = 0;
program_ = 0;
kernel_ = 0;
_queue = 0;
_deviceId = deviceId;
dxD3D11Context = NULL;
dxD3D11Device = NULL;
CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test (%d)", error_);
cl_uint numPlatforms = 0;
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT((error_ != CL_SUCCESS), "clGetPlatformIDs failed");
CHECK_RESULT((numPlatforms == 0), "No platform found");
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
platform_ = platforms[_platformIndex];
CHECK_RESULT((platform_ == 0), "AMD Platform not found");
delete[] platforms;
error_ = _wrapper->clGetDeviceIDs(platform_, type_, 0, NULL, &deviceCount_);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
devices_ = new cl_device_id[deviceCount_];
error_ =
_wrapper->clGetDeviceIDs(platform_, type_, deviceCount_, devices_, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs() failed");
ExtensionCheck();
if (!extensionsAvailable) {
return;
}
// extract function pointers for exported functions
INITPFN(clGetDeviceIDsFromD3D11KHR);
INITPFN(clCreateFromD3D11BufferKHR);
INITPFN(clCreateFromD3D11Texture2DKHR);
INITPFN(clCreateFromD3D11Texture3DKHR);
INITPFN(clEnqueueAcquireD3D11ObjectsKHR);
INITPFN(clEnqueueReleaseD3D11ObjectsKHR);
INITPFN(clGetPlaneFromImageAMD);
char name[1024] = {0};
size_t size = 0;
if (deviceId >= deviceCount_) {
_errorFlag = true;
return;
}
HRESULT hr = S_OK;
UINT createDeviceFlags = 0;
D3D_FEATURE_LEVEL featureLevels[] = {
(D3D_FEATURE_LEVEL)D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0
};
D3D_FEATURE_LEVEL featureLevel;
// Create only the device, not the swapchain. We can't create the swapchain
// anyways without a handle to a window we explicitly own
hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
createDeviceFlags, featureLevels,
_countof(featureLevels), D3D11_SDK_VERSION,
&dxD3D11Device, &featureLevel, &dxD3D11Context);
if (FAILED(hr)) {
hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
createDeviceFlags, featureLevels + 1,
_countof(featureLevels) - 1, D3D11_SDK_VERSION,
&dxD3D11Device, &featureLevel, &dxD3D11Context);
}
if (FAILED(hr)) {
hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_SOFTWARE, NULL,
createDeviceFlags, featureLevels,
_countof(featureLevels), D3D11_SDK_VERSION,
&dxD3D11Device, &featureLevel, &dxD3D11Context);
}
if (FAILED(hr)) {
hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_SOFTWARE, NULL,
createDeviceFlags, featureLevels + 1,
_countof(featureLevels) - 1, D3D11_SDK_VERSION,
&dxD3D11Device, &featureLevel, &dxD3D11Context);
}
cl_int status = 0;
cl_context_properties cps[7] = {
CL_CONTEXT_D3D11_DEVICE_KHR,
(cl_context_properties)(ID3D11Device*)dxD3D11Device,
CL_CONTEXT_INTEROP_USER_SYNC,
CL_FALSE,
CL_CONTEXT_PLATFORM,
(cl_context_properties)platform_,
0};
cl_context_properties* cprops = (NULL == platform_) ? NULL : cps;
cl_uint deviceListSize = 0;
clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device,
CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 0, NULL,
&deviceListSize);
std::vector<cl_device_id> devices;
devices.resize(deviceListSize);
clGetDeviceIDsFromD3D11KHR(platform_, CL_D3D11_DEVICE_KHR, dxD3D11Device,
CL_PREFERRED_DEVICES_FOR_D3D11_KHR, deviceListSize,
&devices[0], NULL);
bool ret = false;
// Check that current device can be associated with OpenGL context
for (unsigned int i = 0; i < deviceListSize; i++) {
if (devices[i] == devices_[_deviceId]) {
ret = true;
break;
}
}
if (ret) {
char buf[2000];
_wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
sizeof(buf), buf, NULL);
context_ =
clCreateContext(cprops, 1, &devices_[_deviceId], NULL, NULL, &status);
_queue = clCreateCommandQueue(context_, devices_[_deviceId], 0, &status);
}
CHECK_RESULT((ret != true), "Can't find D3D device!");
}
unsigned int OCLDX11Common::close(void) {
clReleaseCommandQueue(_queue);
unsigned int retVal = OCLTestImp::close();
// deleteDXDevice(hDX_);
if (dxD3D11Context) dxD3D11Context->Release();
if (dxD3D11Device) dxD3D11Device->Release();
return retVal;
}
@@ -0,0 +1,68 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_DX11_COMMON_H_
#define _OCL_DX11_COMMON_H_
#include <CL/cl.h>
#include <CL/cl_d3d11.h>
#include "OCLTestImp.h"
#include "d3d11.h"
// Function-pointer type for the clGetPlaneFromImageAMD extension entry point:
// takes a context, an image and a plane index, and returns a cl_mem with an
// optional error code written to errcode_ret.
typedef CL_API_ENTRY cl_mem(CL_API_CALL* clGetPlaneFromImageAMD_fn)(
cl_context /* context */, cl_mem /* mem */, cl_uint /* plane */,
cl_int* /* errcode_ret */);
// Common base for the D3D11 interop tests: owns the D3D11 device/context,
// the OpenCL command queue and the resolved D3D11-sharing entry points.
class OCLDX11Common : public OCLTestImp {
public:
/////////////////////////////////
// construction and clean-up   //
/////////////////////////////////
OCLDX11Common();
virtual ~OCLDX11Common();
///////////////////////
// virtual interface //
///////////////////////
// Sets up platform, devices, D3D11 device and OpenCL context/queue.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
// Releases the queue and the D3D11 objects; returns OCLTestImp::close().
virtual unsigned int close(void);
protected:
// Set by ExtensionCheck(); open() returns early when it is false.
bool extensionsAvailable;
ID3D11Device* dxD3D11Device;
ID3D11DeviceContext* dxD3D11Context;
ID3D11Texture2D* dxDX11Texture;
cl_command_queue _queue;
// Extension entry points, resolved in open() through INITPFN.
clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR;
clCreateFromD3D11BufferKHR_fn clCreateFromD3D11BufferKHR;
clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR;
clCreateFromD3D11Texture3DKHR_fn clCreateFromD3D11Texture3DKHR;
clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR;
clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR;
clGetPlaneFromImageAMD_fn clGetPlaneFromImageAMD;
private:
// Verifies the required platform/device extensions are exported.
void ExtensionCheck();
};
#endif // _OCL_DX11_COMMON_H_
@@ -0,0 +1,478 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLDX11YUY2.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#define DXGI_FORMAT_NV12 103
#define DXGI_FORMAT_P010 104
#define GROUP_SIZE 256
// OpenCL C source for the image2imageCopy kernel: each work-item reads one
// texel from `input` with an unnormalized, clamped, nearest-filter sampler
// and writes it to the same coordinate of `output`.
const static char strKernel[] =
"__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | "
"CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \n"
"__kernel void image2imageCopy( "
" \n"
" __read_only image2d_t input, "
" \n"
" __write_only image2d_t output) "
" \n"
"{ "
" \n"
" int2 coord = (int2)(get_global_id(0), get_global_id(1)); "
" \n"
" uint4 temp = read_imageui(input, imageSampler, coord); "
" \n"
" write_imageui(output, coord, temp); "
" \n"
"} "
" \n";
// Declares four sub-tests and the work-group shape. The image and texture
// handles are also cleared here because close() tests them before releasing;
// they must not be indeterminate if open() was never called.
OCLDX11YUY2::OCLDX11YUY2() : OCLDX11Common() {
  _numSubTests = 4;
  blockSizeX = GROUP_SIZE;
  blockSizeY = 1;
  clImage2DOut = 0;
  dxDX11Texture = 0;
}
// Intentionally empty: the resources visible here are released in close().
OCLDX11YUY2::~OCLDX11YUY2() {
}
// Opens the common D3D11 environment, picks the DXGI format for this
// sub-test (NV12 for sub-tests 0-1, P010 otherwise), and, when the format is
// supported, compiles the copy kernel and allocates the output image.
void OCLDX11YUY2::open(unsigned int test, char *units, double &conversion,
                       unsigned int deviceId) {
  dxDX11Texture = 0;
  clImage2DOut = 0;
  _openTest = test;

  // Initialize random number seed
  srand((unsigned int)time(NULL));

  OCLDX11Common::open(test, units, conversion, deviceId);
  if (_errorFlag || !extensionsAvailable) {
    return;
  }

  const bool useNV12 = (_openTest < 2);
  if (useNV12) {
    dxFormat = (DXGI_FORMAT)DXGI_FORMAT_NV12;
  } else {
    dxFormat = (DXGI_FORMAT)DXGI_FORMAT_P010;
  }

  // From here on the flag records whether the chosen format is usable.
  extensionsAvailable = formatSupported();
  if (!extensionsAvailable) {
    if (useNV12) {
      printf("DXGI_FORMAT_NV12 is required for this test!\n");
    } else {
      printf("DXGI_FORMAT_P010 is required for this test!\n");
    }
    return;
  }

  CompileKernel();
  AllocateOpenCLImage();
}
void OCLDX11YUY2::run(void) {
if (_errorFlag) return;
if (!extensionsAvailable) return;
D3D11_TEXTURE2D_DESC Desc = {0};
Desc.ArraySize = 1;
Desc.BindFlags = 0;
Desc.Format = dxFormat;
Desc.Width = OCLDX11YUY2::WIDTH;
Desc.Height = OCLDX11YUY2::HEIGHT;
Desc.MipLevels = 1;
Desc.SampleDesc.Count = 1;
// Desc.MiscFlags=D3D11_RESOURCE_MISC_SHARED; //MM for fast GPU interop
// MM: these flags are incompatible with D3D11_RESOURCE_MISC_SHARED
// now we allocate texture without CPU access and if needed use temp texture
// (see FromSystemToDX11 and FromDX11ToSystem)
Desc.Usage = D3D11_USAGE_STAGING;
Desc.BindFlags = 0;
Desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE | D3D11_CPU_ACCESS_READ;
ID3D11Texture2D *pTextureTmp;
HRESULT hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &pTextureTmp);
// fill memory
D3D11_MAPPED_SUBRESOURCE LockedRectD11;
if (SUCCEEDED(hr)) {
hr =
dxD3D11Context->Map(pTextureTmp, 0, D3D11_MAP_WRITE, 0, &LockedRectD11);
}
if (SUCCEEDED(hr)) {
// fill memory with something
for (int y = 0; y < OCLDX11YUY2::HEIGHT; y++) {
BYTE *pLine = (BYTE *)LockedRectD11.pData + y * LockedRectD11.RowPitch;
BYTE *pLineUV = (BYTE *)LockedRectD11.pData + y * LockedRectD11.RowPitch +
OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch;
for (int x = 0; x < OCLDX11YUY2::WIDTH; x++) {
*pLine++ = 0x7F; // Y
if (y < OCLDX11YUY2::HEIGHT / 2 && x < OCLDX11YUY2::WIDTH / 2) {
*pLineUV++ = 0x1F; // U
*pLineUV++ = 0x2F; // V
}
}
}
dxD3D11Context->Unmap(pTextureTmp, 0);
}
Desc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE;
Desc.Usage = D3D11_USAGE_DEFAULT;
Desc.CPUAccessFlags = 0;
Desc.MiscFlags = (_openTest == 0)
? 0
: D3D11_RESOURCE_MISC_SHARED; // MM for fast GPU interop
hr = dxD3D11Device->CreateTexture2D(&Desc, NULL, &dxDX11Texture);
if (pTextureTmp != NULL) {
dxD3D11Context->CopySubresourceRegion(dxDX11Texture, 0, 0, 0, 0,
pTextureTmp, 0, NULL);
pTextureTmp->Release();
}
testInterop();
}
// Creates the write-only 2D output image: single channel (CL_R), 8-bit
// elements for NV12 and 16-bit otherwise, WIDTH wide and HEIGHT * 1.5 tall.
void OCLDX11YUY2::AllocateOpenCLImage() {
  cl_image_format format{};
  format.image_channel_order = CL_R;
  if (dxFormat == DXGI_FORMAT_NV12) {
    format.image_channel_data_type = CL_UNSIGNED_INT8;
  } else {
    format.image_channel_data_type = CL_UNSIGNED_INT16;
  }

  cl_image_desc descr{};
  descr.image_type = CL_MEM_OBJECT_IMAGE2D;
  descr.image_width = WIDTH;
  descr.image_height = HEIGHT + HEIGHT / 2;

  cl_int status = 0;
  clImage2DOut = clCreateImage(context_, CL_MEM_WRITE_ONLY, &format, &descr,
                               NULL, &status);
  CHECK_RESULT((status != CL_SUCCESS), "AllocateOpenCLImage() failed");
}
void OCLDX11YUY2::testInterop() {
// alloc
cl_int clStatus = 0;
cl_mem clImage2D =
clCreateFromD3D11Texture2DKHR(context_, 0, dxDX11Texture, 0, &clStatus);
CHECK_RESULT((clStatus != CL_SUCCESS),
"clCreateFromD3D11Texture2DKHR() failed");
// bring objects to the queue
cl_event clEvent = NULL;
clEnqueueAcquireD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent);
clStatus = clWaitForEvents(1, &clEvent);
clReleaseEvent(clEvent);
CopyOpenCLImage(clImage2D);
bool ImageReadWorks = CheckCLImage(clImage2D);
bool bKernelWorks = CheckCLImage(clImage2DOut);
CHECK_RESULT_NO_RETURN((ImageReadWorks != true),
"CheckCLImage(clImage2D) failed");
CHECK_RESULT_NO_RETURN((bKernelWorks != true),
"CheckCLImage(clImage2DOut) failed");
cl_mem planeY = clGetPlaneFromImageAMD(context_, clImage2D, 0, &clStatus);
CHECK_RESULT((clStatus != CL_SUCCESS),
"clGetPlaneFromImageAMD(context_,clImage2D,0,&clStatus) failed");
cl_mem planeUV = clGetPlaneFromImageAMD(context_, clImage2D, 1, &clStatus);
CHECK_RESULT((clStatus != CL_SUCCESS),
"clGetPlaneFromImageAMD(context_,clImage2D,1,&clStatus) failed");
bool ImageWorksY = CheckCLImageY(planeY);
bool ImageWorksUV = CheckCLImageUV(planeUV);
clReleaseMemObject(planeY);
clReleaseMemObject(planeUV);
// release
clEvent = NULL;
// release object from the queue
clStatus =
clEnqueueReleaseD3D11ObjectsKHR(_queue, 1, &clImage2D, 0, NULL, &clEvent);
clStatus = clWaitForEvents(1, &clEvent);
clReleaseEvent(clEvent);
// release mem object
clReleaseMemObject(clImage2D);
CHECK_RESULT_NO_RETURN((ImageWorksY != true), "CheckCLImageY() failed");
CHECK_RESULT_NO_RETURN((ImageWorksUV != true), "CheckCLImageUV() failed");
}
// Releases the output image and the D3D11 texture, then delegates to
// OCLDX11Common::close(). Handles are reset after release so a repeated
// close() cannot release them twice.
unsigned int OCLDX11YUY2::close(void) {
  if (clImage2DOut) {
    clReleaseMemObject(clImage2DOut);
    clImage2DOut = 0;
  }
  if (dxDX11Texture) {
    dxDX11Texture->Release();
    dxDX11Texture = 0;
  }
  return OCLDX11Common::close();
}
// Reads back a full image (HEIGHT luma rows followed by HEIGHT/2 chroma rows)
// and checks it against the pattern written by run(): the first WIDTH bytes
// of every luma row must be 0x7F, and the first WIDTH bytes of every chroma
// row must alternate 0x1F / 0x2F.
// Returns true when every checked byte matches. Returns false if the pitch
// query or the read-back fails, so the comparison never runs on data that was
// not read.
bool OCLDX11YUY2::CheckCLImage(cl_mem clImage) {
  size_t pitch = 0;
  cl_int clStatus =
      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
  if (clStatus != CL_SUCCESS || pitch == 0) return false;
  // Host rows are read with twice the reported pitch.
  pitch *= 2;

  size_t height = 0;
  clStatus =
      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  CHECK_RESULT_NO_RETURN(height != (HEIGHT + HEIGHT / 2),
                         "CheckCLImage: height!=(HEIGHT+HEIGHT/2)");

  // Zero-initialized so a short read can never look like matching data.
  char *pTempBuffer = new char[(HEIGHT + HEIGHT / 2) * pitch]();
  size_t origin[] = {0, 0, 0};
  size_t region[] = {WIDTH, HEIGHT + HEIGHT / 2, 1};
  clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0,
                                pTempBuffer, 0, 0, 0);
  ::clFinish(_queue);
  if (clStatus != CL_SUCCESS) {
    delete[] pTempBuffer;
    return false;
  }

  // test
  bool matches = true;
  for (int y = 0; y < HEIGHT && matches; y++) {
    const char *pLine = pTempBuffer + y * pitch;
    const char *pLineUV = pTempBuffer + y * pitch + HEIGHT * pitch;
    const bool hasChromaRow = (y < HEIGHT / 2);
    for (int x = 0; x < WIDTH && matches; x++) {
      if (pLine[x] != 0x7F) {  // Y
        matches = false;
      } else if (hasChromaRow && x < WIDTH / 2) {
        // U then V, interleaved
        matches = (pLineUV[2 * x] == 0x1F) && (pLineUV[2 * x + 1] == 0x2F);
      }
    }
  }
  delete[] pTempBuffer;
  return matches;
}
// Reads back the luma plane (WIDTH x HEIGHT) and checks that the first WIDTH
// bytes of every row are 0x7F.
// Returns true when every checked byte matches. Returns false if the pitch
// query or the read-back fails, so the comparison never runs on data that was
// not read.
bool OCLDX11YUY2::CheckCLImageY(cl_mem clImage) {
  size_t pitch = 0;
  cl_int clStatus =
      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
  if (clStatus != CL_SUCCESS || pitch == 0) return false;
  // Host rows are read with twice the reported pitch.
  pitch *= 2;

  size_t height = 0;
  clStatus =
      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  CHECK_RESULT_NO_RETURN(height != HEIGHT, "CheckCLImageY: height!=HEIGHT");

  // Zero-initialized so a short read can never look like matching data.
  char *pTempBuffer = new char[HEIGHT * pitch]();
  size_t origin[] = {0, 0, 0};
  size_t region[] = {WIDTH, HEIGHT, 1};
  clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0,
                                pTempBuffer, 0, 0, 0);
  ::clFinish(_queue);
  if (clStatus != CL_SUCCESS) {
    delete[] pTempBuffer;
    return false;
  }

  // test
  bool matches = true;
  for (int y = 0; y < HEIGHT && matches; y++) {
    const char *pLine = pTempBuffer + y * pitch;
    for (int x = 0; x < WIDTH && matches; x++) {
      matches = (pLine[x] == 0x7F);  // Y
    }
  }
  delete[] pTempBuffer;
  return matches;
}
// Reads back the chroma plane (WIDTH/2 x HEIGHT/2) and checks that every row
// starts with WIDTH/2 interleaved 0x1F / 0x2F byte pairs.
// Returns true when every checked byte matches. Returns false if the pitch
// query or the read-back fails, so the comparison never runs on data that was
// not read.
bool OCLDX11YUY2::CheckCLImageUV(cl_mem clImage) {
  size_t pitch = 0;
  cl_int clStatus =
      clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(pitch), &pitch, NULL);
  if (clStatus != CL_SUCCESS || pitch == 0) return false;
  // Host rows are read with twice the reported pitch.
  pitch *= 2;

  size_t height = 0;
  clStatus =
      clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  CHECK_RESULT_NO_RETURN(height != HEIGHT / 2,
                         "CheckCLImageUV: height!=HEIGHT/2");

  // Zero-initialized so a short read can never look like matching data.
  char *pTempBuffer = new char[(HEIGHT / 2) * pitch]();
  size_t origin[] = {0, 0, 0};
  size_t region[] = {WIDTH / 2, HEIGHT / 2, 1};
  clStatus = clEnqueueReadImage(_queue, clImage, 1, origin, region, pitch, 0,
                                pTempBuffer, 0, 0, 0);
  ::clFinish(_queue);
  if (clStatus != CL_SUCCESS) {
    delete[] pTempBuffer;
    return false;
  }

  bool matches = true;
  for (int y = 0; y < HEIGHT / 2 && matches; y++) {
    const char *pLineUV = pTempBuffer + y * pitch;
    for (int x = 0; x < WIDTH / 2 && matches; x++) {
      // U then V, interleaved
      matches = (pLineUV[2 * x] == 0x1F) && (pLineUV[2 * x + 1] == 0x2F);
    }
  }
  delete[] pTempBuffer;
  return matches;
}
// Binds clImageSrc (argument 0) and clImage2DOut (argument 1) to kernel_,
// launches it over a WIDTH x (HEIGHT + HEIGHT / 2) global range with no
// explicit work-group size, and waits for the queue to drain. Every OpenCL
// status is passed to CHECK_RESULT.
void OCLDX11YUY2::CopyOpenCLImage(cl_mem clImageSrc) {
  cl_int err = 0;

  // Argument 0: the source image.
  err = clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clImageSrc);
  CHECK_RESULT((err != CL_SUCCESS),
               "CopyOpenCLImage() failed at "
               "clSetKernelArg(kernel_,0,sizeof(cl_mem),&clImageSrc)");

  // Argument 1: the destination image owned by this test.
  err = clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clImage2DOut);
  CHECK_RESULT((err != CL_SUCCESS),
               "CopyOpenCLImage() failed at "
               "clSetKernelArg(kernel_,1,sizeof(cl_mem),&clImage2DOut)");

  // The offset and local-size arrays are declared but not handed to the
  // launch below; only the global size is passed.
  size_t launchOffset[] = {0, 0};
  size_t globalSize[] = {WIDTH, HEIGHT + HEIGHT / 2};
  size_t localSize[] = {blockSizeX, blockSizeY};

  err = clEnqueueNDRangeKernel(_queue, kernel_, 2, NULL, globalSize, NULL, 0,
                               NULL, 0);
  CHECK_RESULT((err != CL_SUCCESS),
               "CopyOpenCLImage() failed at clEnqueueNDRangeKernel");

  // Block until the copy has completed.
  err = clFinish(_queue);
  CHECK_RESULT((err != CL_SUCCESS), "CopyOpenCLImage() failed at clFinish");
}
// Creates program_ from strKernel, builds it for the current device and
// creates the "image2imageCopy" kernel in kernel_. When the build fails with
// CL_BUILD_PROGRAM_FAILURE the build log is printed. Finally, if the
// configured block size exceeds the kernel's work-group limit and blockSizeX
// alone exceeds it, the block is collapsed to (limit, 1).
//
// The function returns early, leaving the remaining state untouched, as soon
// as any OpenCL step fails.
void OCLDX11YUY2::CompileKernel() {
  cl_int status = 0;
  size_t kernelSize = sizeof(strKernel);
  const char *strs = (const char *)&strKernel[0];
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strs,
                                                 &kernelSize, &status);
  if (status != CL_SUCCESS) {
    // Do not hand an invalid program handle to clBuildProgram.
    return;
  }
  status = _wrapper->clBuildProgram(program_, 1, &devices_[_deviceId], NULL,
                                    NULL, NULL);
  if (status != CL_SUCCESS) {
    if (status == CL_BUILD_PROGRAM_FAILURE) {
      cl_int logStatus;
      size_t buildLogSize = 0;
      // First call only asks for the size of the log.
      logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId],
                                        CL_PROGRAM_BUILD_LOG, 0, NULL,
                                        &buildLogSize);
      if (logStatus == CL_SUCCESS && buildLogSize > 0) {
        std::string buildLog;
        buildLog.resize(buildLogSize);
        logStatus = clGetProgramBuildInfo(program_, devices_[_deviceId],
                                          CL_PROGRAM_BUILD_LOG, buildLogSize,
                                          &buildLog[0], NULL);
        printf("%s", buildLog.c_str());
      }
    }
    return;
  }
  // get a kernel object handle for a kernel with the given name
  kernel_ = _wrapper->clCreateKernel(program_, "image2imageCopy", &status);
  if (status != CL_SUCCESS) {
    // No kernel: querying its work-group size would fail and the clamp below
    // would force blockSizeX to 0.
    return;
  }
  size_t kernel2DWorkGroupSize = 0;
  status = clGetKernelWorkGroupInfo(kernel_, devices_[_deviceId],
                                    CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
                                    &kernel2DWorkGroupSize, 0);
  if (status != CL_SUCCESS || kernel2DWorkGroupSize == 0) {
    // Limit unknown: keep the configured block size rather than zeroing it.
    return;
  }
  if ((blockSizeX * blockSizeY) > kernel2DWorkGroupSize) {
    if (blockSizeX > kernel2DWorkGroupSize) {
      blockSizeX = kernel2DWorkGroupSize;
      blockSizeY = 1;
    }
  }
}
bool OCLDX11YUY2::formatSupported() {
UINT supported = 0u;
dxD3D11Device->CheckFormatSupport(dxFormat, (UINT *)&supported);
return supported & D3D11_FORMAT_SUPPORT_TEXTURE2D;
}
@@ -0,0 +1,56 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_DX11_YUY2_H_
#define _OCL_DX11_YUY2_H_
#include "OCLDX11Common.h"
// Test class for D3D11 / OpenCL image interop, built on OCLDX11Common.
// It overrides the open/run/close entry points and adds helpers that verify
// image contents, copy an image with a kernel and check D3D11 format support.
class OCLDX11YUY2 : public OCLDX11Common {
 public:
  OCLDX11YUY2();
  virtual ~OCLDX11YUY2();
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceId);
  virtual void run(void);
  virtual unsigned int close(void);

 protected:
  // Dimensions, in pixels, of the image used by the checks and the copy.
  static const unsigned int WIDTH = 1280;
  static const unsigned int HEIGHT = 720;
  void testInterop();
  void AllocateOpenCLImage();
  // Returns true when the image holds the expected Y (0x7F), U (0x1F) and
  // V (0x2F) values.
  bool CheckCLImage(cl_mem clImage);
  // Returns true when every inspected byte equals the expected Y value 0x7F.
  bool CheckCLImageY(cl_mem clImage);
  // Returns true when the inspected bytes alternate U (0x1F) / V (0x2F).
  bool CheckCLImageUV(cl_mem clImage);
  // Runs the copy kernel from clImageSrc into clImage2DOut.
  void CopyOpenCLImage(cl_mem clImageSrc);
  // Builds the program and creates the "image2imageCopy" kernel.
  void CompileKernel();
  // True when the D3D11 device reports Texture2D support for dxFormat.
  bool formatSupported();
  void testFormat();
  size_t blockSizeX; /**< Work-group size in x-direction */
  size_t blockSizeY; /**< Work-group size in y-direction */
  cl_mem clImage2DOut;  // Destination image written by CopyOpenCLImage().
  DXGI_FORMAT dxFormat;  // Format passed to the D3D11 support query.
};
#endif // _OCL_DX11_YUY2_H_
@@ -0,0 +1,52 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLTestListImp.h"
//
// Includes for tests
//
#ifdef ATI_OS_WIN
#include "OCLDX11YUY2.h"
#endif
//
// Helper macro for adding tests
//
// Factory used by the test table: heap-allocates a value-initialised T and
// hands it back as an untyped pointer. The caller owns the object.
template <typename T>
static void* dictionary_CreateTestFunc(void) {
  T* instance = new T();
  return instance;
}
// Expands to a TestEntry initializer: the stringized class name followed by
// the address of the factory instantiated for that class.
#define TEST(name) \
{ #name, &dictionary_CreateTestFunc < name> }
// Table of tests exported by this library. With ATI_OS_WIN defined it holds
// the OCLDX11YUY2 test and TestListCount is derived from the array size.
#ifdef ATI_OS_WIN
TestEntry TestList[] = {TEST(OCLDX11YUY2)};
unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);
#else
// Otherwise the array only holds a placeholder entry and the advertised
// count is 0.
TestEntry TestList[] = {{"void", 0}};
unsigned int TestListCount = 0;
#endif
// Version number and name reported for this test library.
unsigned int TestLibVersion = 0;
const char* TestLibName = "ocldx";
@@ -0,0 +1 @@
# all clear
@@ -0,0 +1,220 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLBuffer.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// OpenCL C source of the "glbuffer_test" kernel: each work-item reads one
// uint4 from `source` and writes source + 1 to `clDest` and source + 2 to
// `glDest`.
const static char* strKernel =
"__kernel void glbuffer_test( __global uint4 *source, __global uint4 "
"*glDest, __global uint4 *clDest) \n"
"{ "
" \n"
" int tid = get_global_id(0); "
" \n"
" clDest[ tid ] = source[ tid ] + (uint4)(1); "
" \n"
" glDest[ tid ] = source[ tid ] + (uint4)(2); "
" \n"
"} "
" \n";
// Starts with both GL buffer names set to 0 and declares a single sub-test.
OCLGLBuffer::OCLGLBuffer() : inGLBuffer_(0), outGLBuffer_(0) {
_numSubTests = 1;
}
// Empty: the GL buffers and CL memory objects are released in close().
OCLGLBuffer::~OCLGLBuffer() {}
// Prepares the test: seeds rand(), runs the common CL/GL setup, then builds
// the program from strKernel and creates the "glbuffer_test" kernel.
//
// Parameters are forwarded unchanged to OCLGLCommon::open(); deviceId also
// selects the device the program is built for. Failures are reported through
// CHECK_RESULT. On a build failure the complete build log is printed first.
void OCLGLBuffer::open(unsigned int test, char* units, double& conversion,
                       unsigned int deviceId) {
  // Initialize random number seed
  srand((unsigned int)time(NULL));
  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;
  // Build the kernel
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateProgramWithSource() failed (%d)", error_);
  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Ask for the log size first: a fixed-size buffer that is too small makes
    // the query fail and would leave the buffer uninitialised.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog = new char[logSize + 1];
    programLog[0] = '\0';  // prints as empty if the query below fails
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
                                    NULL);
    programLog[logSize] = '\0';  // guarantee termination
    printf("\n%s\n", programLog);
    fflush(stdout);
    delete[] programLog;
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
  kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
// Runs the interop test: fills a GL buffer with random uint4 values, wraps
// the GL input and output buffers as CL memory objects, runs the kernel so
// that it writes input + 1 to a plain CL buffer and input + 2 to the GL
// output buffer, then reads both back and compares every component with the
// expected value. Failures are reported through CHECK_RESULT.
void OCLGLBuffer::run(void) {
  if (_errorFlag) {
    return;
  }
  cl_mem buffer;
  cl_uint4 inData[c_numOfElements] = {{{0}}};
  cl_uint4 outDataCL[c_numOfElements] = {{{0}}};
  cl_uint4 outDataGL[c_numOfElements] = {{{0}}};
  // Initialize input data with random values
  for (unsigned int i = 0; i < c_numOfElements; i++) {
    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
      inData[i].s[j] = (unsigned int)rand();
    }
  }
  // Generate and Bind in & out OpenGL buffers
  glGenBuffers(1, &inGLBuffer_);
  glGenBuffers(1, &outGLBuffer_);
  glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer_);
  glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inData,
               GL_STATIC_DRAW);
  glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_);
  glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), outDataGL,
               GL_STATIC_DRAW);
  glBindBuffer(GL_ARRAY_BUFFER, 0);
  glFinish();
  // Create input buffer from GL input buffer
  buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_READ_ONLY,
                                          inGLBuffer_, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create input GL buffer (%d)",
               error_);
  buffers_.push_back(buffer);
  // Create output buffer from GL output buffer
  buffer = _wrapper->clCreateFromGLBuffer(context_, CL_MEM_WRITE_ONLY,
                                          outGLBuffer_, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to create output GL buffer (%d)",
               error_);
  buffers_.push_back(buffer);
  // Create a CL output buffer
  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE,
                                    c_numOfElements * sizeof(cl_uint4), NULL,
                                    &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)", error_);
  buffers_.push_back(buffer);
  // Assign args and execute; the push order above matches the kernel
  // signature (source, glDest, clDest)
  for (unsigned int i = 0; i < buffers_.size(); i++) {
    error_ =
        _wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]);
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);
  }
  // Only the first two memory objects are GL-backed and need acquiring
  error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2,
                                               &buffers()[0], 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
               error_);
  size_t gws[1] = {c_numOfElements};
  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
                                            NULL, gws, NULL, 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
               error_);
  error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2,
                                               &buffers()[0], 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReleaseGLObjects failed (%d)",
               error_);
  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_);
  // Get the results from both CL and GL buffers
  error_ = _wrapper->clEnqueueReadBuffer(
      cmdQueues_[_deviceId], buffers()[2], CL_TRUE, 0,
      c_numOfElements * sizeof(cl_uint4), outDataCL, 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! (%d)",
               error_);
  glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer_);
  void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
  // glMapBuffer returns NULL on failure; copying from it would crash
  CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
  memcpy(outDataGL, glMem, c_numOfElements * sizeof(cl_uint4));
  glUnmapBuffer(GL_ARRAY_BUFFER);
  cl_uint4 expectedCL = {{0}};
  cl_uint4 expectedGL = {{0}};
  // Check output
  for (unsigned int i = 0; i < c_numOfElements; ++i) {
    // Calculate expected value in CL output buffer (input + 1)
    expectedCL = inData[i];
    expectedCL.s[0]++;
    expectedCL.s[1]++;
    expectedCL.s[2]++;
    expectedCL.s[3]++;
    // Calculate expected value in GL output buffer (input + 2)
    expectedGL = inData[i];
    expectedGL.s[0] += 2;
    expectedGL.s[1] += 2;
    expectedGL.s[2] += 2;
    expectedGL.s[3] += 2;
    // Compare expected output with actual data received
    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
      CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]),
                   "Element %d in CL output buffer is incorrect!\n\t \
expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
                   i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2],
                   expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1],
                   outDataCL[i].s[2], outDataCL[i].s[3]);
      CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]),
                   "Element %d in GL output buffer is incorrect!\n\t \
expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
                   i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2],
                   expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1],
                   outDataGL[i].s[2], outDataGL[i].s[3]);
    }
  }
}
// Tears the test down: releases every CL memory object recorded by run(),
// deletes the two GL buffers, then delegates to OCLGLCommon::close() and
// returns its result.
unsigned int OCLGLBuffer::close(void) {
  // CL memory objects first, then forget them.
  for (unsigned int idx = 0; idx < buffers().size(); ++idx) {
    clReleaseMemObject(buffers()[idx]);
  }
  buffers_.clear();

  // Unbind before deleting the GL input and output buffers.
  glBindBuffer(GL_ARRAY_BUFFER, 0);

  glDeleteBuffers(1, &inGLBuffer_);
  inGLBuffer_ = 0;

  glDeleteBuffers(1, &outGLBuffer_);
  outGLBuffer_ = 0;

  return OCLGLCommon::close();
}
@@ -0,0 +1,42 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_BUFFER_H_
#define _OCL_GL_BUFFER_H_
#include "OCLGLCommon.h"
// CL/GL buffer-sharing test: a kernel reads a GL-backed input buffer and
// writes to both a GL-backed output buffer and a plain CL buffer; run()
// verifies both outputs.
class OCLGLBuffer : public OCLGLCommon {
 public:
  OCLGLBuffer();
  virtual ~OCLGLBuffer();
  // Common setup plus program build and kernel creation.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceId);
  // Executes the kernel and checks the results.
  virtual void run(void);
  // Releases the CL memory objects and GL buffers.
  virtual unsigned int close(void);

 private:
  // Number of cl_uint4 elements in each buffer.
  static const unsigned int c_numOfElements = 1024;
  GLuint inGLBuffer_;   // GL name of the input buffer (0 when not created).
  GLuint outGLBuffer_;  // GL name of the output buffer (0 when not created).
};
#endif // _OCL_GL_BUFFER_H_
@@ -0,0 +1,303 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLBufferMultipleQueues.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// OpenCL C source of the "glbuffer_test" kernel: each work-item reads one
// uint4 from `source` and writes source + 2 to `glDest` and source + 1 to
// `clDest`.
const static char* strKernel =
"__kernel void glbuffer_test( __global uint4 *source, __global uint4 "
"*glDest, __global uint4 *clDest) \n"
"{ "
" \n"
" int tid = get_global_id(0); "
" \n"
" glDest[ tid ] = source[ tid ] + (uint4)(2); "
" \n"
" clDest[ tid ] = source[ tid ] + (uint4)(1); "
" \n"
"} "
" \n";
// Declares a single sub-test; the per-queue containers start out empty.
OCLGLBufferMultipleQueues::OCLGLBufferMultipleQueues() { _numSubTests = 1; }
// Empty: queues, memory objects and GL buffers are released in close().
OCLGLBufferMultipleQueues::~OCLGLBufferMultipleQueues() {}
// Prepares the test: seeds rand(), runs the common CL/GL setup, collects
// QUEUES_PER_DEVICE_COUNT command queues for the device (the one created by
// the base class plus newly created ones), then builds the program from
// strKernel and creates the "glbuffer_test" kernel.
//
// Parameters are forwarded unchanged to OCLGLCommon::open(); deviceId also
// selects the device used for the queues and the build. Failures are reported
// through CHECK_RESULT. On a build failure the complete build log is printed
// first.
void OCLGLBufferMultipleQueues::open(unsigned int test, char* units,
                                     double& conversion,
                                     unsigned int deviceId) {
  // Initialize random number seed
  srand((unsigned int)time(NULL));
  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;
  // Create multiple queues for the device (first add already created queue in
  // OCLGLCommon::open, then add a second queue)
  deviceCmdQueues_.resize(QUEUES_PER_DEVICE_COUNT);
  deviceCmdQueues_[0] = cmdQueues_[deviceId];
  for (int queueIndex = 1; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
    cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
        context_, devices_[deviceId], 0, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
    deviceCmdQueues_[queueIndex] = cmdQueue;
  }
  // Build the kernel
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateProgramWithSource() failed (%d)", error_);
  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Ask for the log size first: a fixed-size buffer that is too small makes
    // the query fail and would leave the buffer uninitialised.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog = new char[logSize + 1];
    programLog[0] = '\0';  // prints as empty if the query below fails
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
                                    NULL);
    programLog[logSize] = '\0';  // guarantee termination
    printf("\n%s\n", programLog);
    fflush(stdout);
    delete[] programLog;
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
  kernel_ = _wrapper->clCreateKernel(program_, "glbuffer_test", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
// Runs the interop test on every queue in deviceCmdQueues_. It works in three
// passes:
//   1. per queue, create random input data, a GL input buffer, a GL output
//      buffer, their CL wrappers and a plain CL output buffer;
//   2. per queue, set the kernel arguments, acquire the GL objects, enqueue
//      the kernel, release the GL objects and flush;
//   3. per queue, read back the CL and GL outputs and compare every component
//      with input + 1 (CL) and input + 2 (GL).
// Failures are reported through CHECK_RESULT.
void OCLGLBufferMultipleQueues::run(void) {
  if (_errorFlag) {
    return;
  }
  inputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
  outputGLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
  outputCLBufferPerQueue_.resize(QUEUES_PER_DEVICE_COUNT, NULL);
  std::vector<std::vector<cl_uint4> > inData(
      QUEUES_PER_DEVICE_COUNT);  // Input data per queue
  inGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0);
  outGLBufferIDs_.resize(QUEUES_PER_DEVICE_COUNT, 0);
  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
    // Initialize input data with random values
    inData[queueIndex].resize(BUFFER_ELEMENTS_COUNT);
    for (int i = 0; i < BUFFER_ELEMENTS_COUNT; i++) {
      for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
        inData[queueIndex][i].s[j] = (unsigned int)rand();
      }
    }
    // Generate and Bind in & out OpenGL buffers
    glGenBuffers(1, &inGLBufferIDs_[queueIndex]);
    glGenBuffers(1, &outGLBufferIDs_[queueIndex]);
    glBindBuffer(GL_ARRAY_BUFFER, inGLBufferIDs_[queueIndex]);
    glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
                 &inData[queueIndex][0], GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]);
    glBufferData(GL_ARRAY_BUFFER, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
                 NULL, GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glFinish();
    // Create input buffer from GL input buffer
    inputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer(
        context_, CL_MEM_READ_ONLY, inGLBufferIDs_[queueIndex], &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "Unable to create input GL buffer (%d)", error_);
    // Create output buffer from GL output buffer
    outputGLBufferPerQueue_[queueIndex] = _wrapper->clCreateFromGLBuffer(
        context_, CL_MEM_WRITE_ONLY, outGLBufferIDs_[queueIndex], &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "Unable to create output GL buffer (%d)", error_);
    // Create a CL output buffer
    outputCLBufferPerQueue_[queueIndex] = _wrapper->clCreateBuffer(
        context_, CL_MEM_WRITE_ONLY, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4),
        NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed (%d)",
                 error_);
  }
  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
    // Assign arguments to kernel according to queue index
    error_ = _wrapper->clSetKernelArg(
        kernel_, 0, sizeof(cl_mem),
        &inputGLBufferPerQueue_[queueIndex]);  // Input source
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);
    error_ = _wrapper->clSetKernelArg(
        kernel_, 1, sizeof(cl_mem),
        &outputGLBufferPerQueue_[queueIndex]);  // Output glDest
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);
    error_ = _wrapper->clSetKernelArg(
        kernel_, 2, sizeof(cl_mem),
        &outputCLBufferPerQueue_[queueIndex]);  // Output clDest
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);
    // Acquire input GL buffer
    error_ = _wrapper->clEnqueueAcquireGLObjects(
        deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0,
        NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
                 error_);
    // Acquire output GL buffer
    error_ = _wrapper->clEnqueueAcquireGLObjects(
        deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex],
        0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
                 error_);
    // Enqueue the kernel
    size_t gws[1] = {BUFFER_ELEMENTS_COUNT};
    error_ =
        _wrapper->clEnqueueNDRangeKernel(deviceCmdQueues_[queueIndex], kernel_,
                                         1, NULL, gws, NULL, 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
                 error_);
    // Release input GL buffer
    error_ = _wrapper->clEnqueueReleaseGLObjects(
        deviceCmdQueues_[queueIndex], 1, &inputGLBufferPerQueue_[queueIndex], 0,
        NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clEnqueueReleaseGLObjects failed (%d)", error_);
    // Release output GL buffer
    error_ = _wrapper->clEnqueueReleaseGLObjects(
        deviceCmdQueues_[queueIndex], 1, &outputGLBufferPerQueue_[queueIndex],
        0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clEnqueueReleaseGLObjects failed (%d)", error_);
    // Flush commands in order to trigger the operations
    error_ = _wrapper->clFlush(deviceCmdQueues_[queueIndex]);
    CHECK_RESULT((error_ != CL_SUCCESS), "clFlush() failed (%d)", error_);
  }
  for (int queueIndex = 0; queueIndex < QUEUES_PER_DEVICE_COUNT; queueIndex++) {
    // Get the results from CL buffer (in a synchronous manner)
    cl_uint4 outDataCL[BUFFER_ELEMENTS_COUNT];
    error_ = _wrapper->clEnqueueReadBuffer(
        deviceCmdQueues_[queueIndex], outputCLBufferPerQueue_[queueIndex],
        CL_TRUE, 0, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4), outDataCL, 0,
        NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to read output CL array! (%d)",
                 error_);
    cl_uint4 outDataGL[BUFFER_ELEMENTS_COUNT] = {{{0}}};
    // glMapBuffer maps whatever is bound to the target, so this queue's
    // output buffer has to be bound first
    glBindBuffer(GL_ARRAY_BUFFER, outGLBufferIDs_[queueIndex]);
    void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
    // glMapBuffer returns NULL on failure; copying from it would crash
    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
    memcpy(outDataGL, glMem, BUFFER_ELEMENTS_COUNT * sizeof(cl_uint4));
    glUnmapBuffer(GL_ARRAY_BUFFER);
    cl_uint4 expectedCL = {{0}};
    cl_uint4 expectedGL = {{0}};
    // Check output
    for (int i = 0; i < BUFFER_ELEMENTS_COUNT; ++i) {
      // Calculate expected value in CL output buffer (input + 1)
      expectedCL = inData[queueIndex][i];
      expectedCL.s[0]++;
      expectedCL.s[1]++;
      expectedCL.s[2]++;
      expectedCL.s[3]++;
      // Calculate expected value in GL output buffer (input + 2)
      expectedGL = inData[queueIndex][i];
      expectedGL.s[0] += 2;
      expectedGL.s[1] += 2;
      expectedGL.s[2] += 2;
      expectedGL.s[3] += 2;
      // Compare expected output with actual data received
      for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
        CHECK_RESULT((outDataCL[i].s[j] != expectedCL.s[j]),
                     "Element %d in CL output buffer is incorrect!\n\t \
expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
                     i, expectedCL.s[0], expectedCL.s[1], expectedCL.s[2],
                     expectedCL.s[3], outDataCL[i].s[0], outDataCL[i].s[1],
                     outDataCL[i].s[2], outDataCL[i].s[3]);
        CHECK_RESULT((outDataGL[i].s[j] != expectedGL.s[j]),
                     "Element %d in GL output buffer is incorrect!\n\t \
expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
                     i, expectedGL.s[0], expectedGL.s[1], expectedGL.s[2],
                     expectedGL.s[3], outDataGL[i].s[0], outDataGL[i].s[1],
                     outDataGL[i].s[2], outDataGL[i].s[3]);
      }
    }
  }
}
// Tears the test down: releases the CL memory objects, deletes the GL
// buffers, releases the extra command queues created in open(), then
// delegates to OCLGLCommon::close() and returns its result.
//
// Slots that were never filled (still NULL because run() stopped early) are
// skipped, and every container is emptied afterwards so that no stale handle
// can be released a second time.
unsigned int OCLGLBufferMultipleQueues::close(void) {
  // Release cl buffers (must be done before releasing the associated GL
  // buffers)
  for (int bufferIndex = 0; bufferIndex < (int)inputGLBufferPerQueue_.size();
       bufferIndex++) {
    if (inputGLBufferPerQueue_[bufferIndex] == NULL) continue;
    error_ = _wrapper->clReleaseMemObject(inputGLBufferPerQueue_[bufferIndex]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseMemObject() failed");
  }
  inputGLBufferPerQueue_.clear();
  for (int bufferIndex = 0; bufferIndex < (int)outputGLBufferPerQueue_.size();
       bufferIndex++) {
    if (outputGLBufferPerQueue_[bufferIndex] == NULL) continue;
    error_ = _wrapper->clReleaseMemObject(outputGLBufferPerQueue_[bufferIndex]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseMemObject() failed");
  }
  outputGLBufferPerQueue_.clear();
  for (int bufferIndex = 0; bufferIndex < (int)outputCLBufferPerQueue_.size();
       bufferIndex++) {
    if (outputCLBufferPerQueue_[bufferIndex] == NULL) continue;
    error_ = _wrapper->clReleaseMemObject(outputCLBufferPerQueue_[bufferIndex]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseMemObject() failed");
  }
  outputCLBufferPerQueue_.clear();
  // Delete GL in & out buffers
  glBindBuffer(GL_ARRAY_BUFFER, 0);
  if (!inGLBufferIDs_.empty()) {
    glDeleteBuffers((int)inGLBufferIDs_.size(), &inGLBufferIDs_[0]);
  }
  inGLBufferIDs_.clear();
  if (!outGLBufferIDs_.empty()) {
    glDeleteBuffers((int)outGLBufferIDs_.size(), &outGLBufferIDs_[0]);
  }
  outGLBufferIDs_.clear();
  // Release queues created by open method, the first queue per device is
  // released by base class
  for (int queueIndex = 1; queueIndex < (int)deviceCmdQueues_.size();
       queueIndex++) {
    if (deviceCmdQueues_[queueIndex] == NULL) continue;
    error_ = _wrapper->clReleaseCommandQueue(deviceCmdQueues_[queueIndex]);
    CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
                           "clReleaseCommandQueue() failed");
  }
  deviceCmdQueues_.clear();
  return OCLGLCommon::close();
}
@@ -0,0 +1,48 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
#define _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
#include "OCLGLCommon.h"
// CL/GL buffer-sharing test that drives the same kernel through several
// command queues of one device, each queue with its own input and output
// buffers.
class OCLGLBufferMultipleQueues : public OCLGLCommon {
 public:
  OCLGLBufferMultipleQueues();
  virtual ~OCLGLBufferMultipleQueues();
  // Common setup, creation of the extra queues, program build and kernel
  // creation.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceId);
  // Executes the kernel on every queue and checks the results.
  virtual void run(void);
  // Releases memory objects, GL buffers and the extra queues.
  virtual unsigned int close(void);

 private:
  // Number of cl_uint4 elements in each buffer.
  static const int BUFFER_ELEMENTS_COUNT = 1024;
  // Number of command queues exercised on the device.
  static const int QUEUES_PER_DEVICE_COUNT = 2;
  std::vector<cl_command_queue>
      deviceCmdQueues_;  // Multiple queues per device (single device)
  std::vector<cl_mem> inputGLBufferPerQueue_;   // Input GL buffer per queue
  std::vector<cl_mem> outputGLBufferPerQueue_;  // Output GL buffer per queue
  std::vector<cl_mem> outputCLBufferPerQueue_;  // Output CL buffer per queue
  std::vector<GLuint> inGLBufferIDs_;           // Input GL buffers IDs
  std::vector<GLuint> outGLBufferIDs_;          // Output GL buffers IDs
};
#endif // _OCL_GL_BUFFER_MULTIPLE_QUEUES_H_
@@ -0,0 +1,270 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLDepthBuffer.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// OpenCL C source of the "gldepths_test" kernel: each work-item samples the
// source image at its 2D global id and stores the z component of the result
// in `output` at index y * image_width + x.
const static char* strKernel =
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"__kernel void gldepths_test( __global float *output, read_only image2d_t "
"source, sampler_t sampler){ \n"
" int tidX = get_global_id(0);\n"
" int tidY = get_global_id(1);\n"
" float4 value = read_imagef( source, sampler, (int2)( tidX, tidY ) );\n"
" output[ tidY * get_image_width( source ) + tidX ] = value.z;\n"
"}\n";
// Zero-initialises every GL/CL handle and host pointer, marks the depth
// extension as unsupported until open() has checked it, and declares two
// sub-tests starting at test 0.
OCLGLDepthBuffer::OCLGLDepthBuffer()
: glDepthBuffer_(0),
frameBufferOBJ_(0),
colorBuffer_(0),
clOutputBuffer_(0),
clDepth_(0),
clSampler_(0),
pGLOutput_(0),
pCLOutput_(0),
extensionSupported_(false) {
_numSubTests = 2;
_currentTest = 0;
}
// Empty destructor; no cleanup is performed here.
OCLGLDepthBuffer::~OCLGLDepthBuffer() {}
// Prepares the test: runs the common CL/GL setup, checks that the device
// advertises "cl_khr_gl_depth_images" (the test is skipped with a message
// otherwise), records the sub-test number, then builds the program from
// strKernel and creates the "gldepths_test" kernel.
//
// Parameters are forwarded unchanged to OCLGLCommon::open(); `test` is stored
// in _currentTest and deviceId selects the device that is queried and built
// for. Failures are reported through CHECK_RESULT. On a build failure the
// complete build log is printed first.
void OCLGLDepthBuffer::open(unsigned int test, char* units, double& conversion,
                            unsigned int deviceId) {
  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;
  // Ask for the length of the extension string first: a fixed-size buffer
  // that is too small makes the query fail and leaves the buffer
  // uninitialised.
  size_t extensionsSize = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
                                     0, NULL, &extensionsSize);
  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)", error_);
  char* pExtensions = (char*)malloc(extensionsSize + 1);
  CHECK_RESULT((pExtensions == NULL), "malloc() failed");
  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS,
                                     extensionsSize, pExtensions, NULL);
  bool depthInteropSupported = false;
  if (error_ == CL_SUCCESS) {
    pExtensions[extensionsSize] = '\0';  // guarantee termination for strstr
    depthInteropSupported =
        (strstr(pExtensions, "cl_khr_gl_depth_images") != NULL);
  }
  // The string is no longer needed; free it before any early exit.
  free(pExtensions);
  CHECK_RESULT((error_ != CL_SUCCESS), "clGetDeviceInfo() failed (%d)", error_);
  // if the extension is not supported
  if (!depthInteropSupported) {
    printf("skipping test depth interop not supported\n");
    return;
  }
  extensionSupported_ = true;
  _currentTest = test;
  // Build the kernel
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateProgramWithSource() failed (%d)", error_);
  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Same two-step query for the build log.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog = new char[logSize + 1];
    programLog[0] = '\0';  // prints as empty if the query below fails
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
                                    NULL);
    programLog[logSize] = '\0';  // guarantee termination
    printf("\n%s\n", programLog);
    fflush(stdout);
    delete[] programLog;
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
  kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
// Runs the depth-read sub-test selected by _currentTest. Each case picks a
// GL depth (or depth/stencil) internal format and the matching framebuffer
// attachment point and passes them to testDepthRead(). Nothing is done when
// setup failed or the depth-image extension is unavailable.
void OCLGLDepthBuffer::run(void) {
  if (_errorFlag || !extensionSupported_) {
    return;
  }
  // Starts as "failed" so that an unknown test number can never fall through
  // to the final check with an indeterminate value.
  bool retVal = false;
  switch (_currentTest) {
    case 0:
      retVal = testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_ATTACHMENT);
      break;
    case 1:
      retVal = testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_ATTACHMENT);
      break;
    case 2:
      retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT);
      break;
    case 3:
      retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL_ATTACHMENT);
      break;
    default:
      CHECK_RESULT(true, "unsupported test number\n");
      break;
  }
  CHECK_RESULT((retVal != true), "cl-gl depth test failed ");
}
bool OCLGLDepthBuffer::testDepthRead(GLint internalFormat,
GLenum attachmentType) {
cl_int error;
size_t dimSizes[] = {c_dimSize, c_dimSize};
unsigned int bufferSize = c_dimSize * c_dimSize * 4;
bool retVal = false;
pGLOutput_ = (float*)malloc(bufferSize);
pCLOutput_ = (float*)malloc(bufferSize);
// create Frame buffer object
glGenFramebuffers(1, &frameBufferOBJ_);
// create textures
glGenTextures(1, &colorBuffer_);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, colorBuffer_);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA,
GL_UNSIGNED_BYTE, 0);
glBindTexture(GL_TEXTURE_2D, 0);
// create a renderbuffer for the depth/stencil buffer
glGenRenderbuffers(1, &glDepthBuffer_);
glBindRenderbuffer(GL_RENDERBUFFER, glDepthBuffer_);
glRenderbufferStorage(GL_RENDERBUFFER, internalFormat, c_dimSize, c_dimSize);
//
glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0);
glFramebufferRenderbuffer(GL_FRAMEBUFFER, attachmentType, GL_RENDERBUFFER,
glDepthBuffer_);
GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
if (GL_FRAMEBUFFER_COMPLETE != status) {
return false;
}
// set up gl state machine
glViewport(0, 0, c_dimSize, c_dimSize); // Reset The Current Viewport
glMatrixMode(GL_PROJECTION); // Select The Projection Matrix
glLoadIdentity(); // Reset The Projection Matrix
gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix
glLoadIdentity();
glEnable(GL_DEPTH_TEST);
// The Type Of Depth Testing To Do
glClear(GL_COLOR_BUFFER_BIT |
GL_DEPTH_BUFFER_BIT); // Clear Screen And Depth Buffer
glBegin(GL_QUADS); // Draw A Quad
glVertex3f(-1.0f, 1.0f, -6.0f); // Top Left
glVertex3f(1.0f, 1.0f, -6.0f); // Top Right
glVertex3f(1.0f, -1.0f, -3.0f); // Bottom Right
glVertex3f(-1.0f, -1.0f, -3.0f); // Bottom Left
glEnd();
glFinish();
clDepth_ = _wrapper->clCreateFromGLRenderbuffer(context_, CL_MEM_READ_WRITE,
glDepthBuffer_, &error);
if (CL_SUCCESS != error) {
printf("clCreateFromGLRenderbuffer failed\n");
return false;
}
clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
bufferSize, NULL, &error);
if (CL_SUCCESS != error) return false;
clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE,
CL_FILTER_NEAREST, &error);
if (CL_SUCCESS != error) return false;
error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
&clDepth_, 0, NULL, NULL);
_wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
_wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_);
_wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_);
_wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL,
dimSizes, NULL, 0, NULL, NULL);
_wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clDepth_, 0,
NULL, NULL);
_wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_, CL_TRUE,
0, bufferSize, pCLOutput_, 0, NULL, NULL);
glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT,
pGLOutput_);
// test that both resources are identical.
if (0 == memcmp(pGLOutput_, pCLOutput_, bufferSize)) {
retVal = true; // test successful
} else {
printf("expected results is different from actual results\n");
dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize);
dumpBuffer(pCLOutput_, "CLDepth.csv", c_dimSize);
}
return retVal;
}
unsigned int OCLGLDepthBuffer::close(void) {
if (pGLOutput_) {
free(pGLOutput_);
pGLOutput_ = NULL;
}
if (pCLOutput_) {
free(pCLOutput_);
pCLOutput_ = NULL;
}
clReleaseMemObject(clDepth_);
clReleaseMemObject(clOutputBuffer_);
clReleaseSampler(clSampler_);
// unbind the texture and frame buffer.
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
glBindFramebuffer(GL_FRAMEBUFFER, 0);
// clean gl resources
glDeleteFramebuffers(1, &frameBufferOBJ_);
frameBufferOBJ_ = 0;
glDeleteTextures(1, &colorBuffer_);
colorBuffer_ = 0;
glDeleteTextures(1, &glDepthBuffer_);
glDepthBuffer_ = 0;
return OCLGLCommon::close();
}
// helper functions
// Maps a GL depth / depth-stencil internal format to the size of one element
// in bytes. Formats not listed here yield 0.
unsigned int OCLGLDepthBuffer::formatToSize(GLint internalFormat) {
  unsigned int bytesPerElement = 0;
  switch (internalFormat) {
    case GL_DEPTH_COMPONENT16:
      bytesPerElement = 2;
      break;
    case GL_DEPTH_COMPONENT32F:
    case GL_DEPTH24_STENCIL8:
      bytesPerElement = 4;
      break;
    case GL_DEPTH32F_STENCIL8:
      bytesPerElement = 8;
      break;
    default:
      break;
  }
  return bytesPerElement;
}
@@ -0,0 +1,66 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_DEPTH_BUFFER_H_
#define _OCL_GL_DEPTH_BUFFER_H_
#include "OCLGLCommon.h"
// CL/GL interop test: renders into a GL depth renderbuffer, reads the same
// renderbuffer through an OpenCL object created from it, and compares the
// OpenCL result with the values returned by glReadPixels.
class OCLGLDepthBuffer : public OCLGLCommon {
public:
OCLGLDepthBuffer();
virtual ~OCLGLDepthBuffer();
// Width and height, in pixels, of the render target and of the CL dispatch.
static const unsigned int c_dimSize = 128;
// Checks for cl_khr_gl_depth_images and builds the read-back kernel.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
// Runs the sub-test chosen in open(); no-op if the test was skipped.
virtual void run(void);
// Releases the GL/CL objects and host buffers created by the test.
virtual unsigned int close(void);
private:
////////////////////
// test functions //
////////////////////
// Renders, reads depth back through GL and CL, returns true when they match.
bool testDepthRead(GLint internalFormat, GLenum attachmentType);
// Sub-test index passed to open(); selects the depth format in run().
unsigned int _currentTest;
/////////////////////
// private members //
/////////////////////
// GL resource identifiers
// Depth renderbuffer name (created with glGenRenderbuffers).
GLuint glDepthBuffer_;
// Framebuffer object the colour texture and depth renderbuffer attach to.
GLuint frameBufferOBJ_;
// RGBA colour texture attached to GL_COLOR_ATTACHMENT0.
GLuint colorBuffer_;
// CL identifiers
// Buffer the kernel writes one float per pixel into.
cl_mem clOutputBuffer_;
// CL object created from glDepthBuffer_.
cl_mem clDepth_;
// Sampler passed to the kernel (non-normalised coords, nearest filtering).
cl_sampler clSampler_;
// pointers to buffers
// Host copy of the depth values read with glReadPixels.
float* pGLOutput_;
// Host copy of the depth values produced by the CL kernel.
float* pCLOutput_;
// Set in open() once cl_khr_gl_depth_images was found on the device.
bool extensionSupported_;
//////////////////////////////
// private helper functions //
//////////////////////////////
// returns element size in bytes.
static unsigned int formatToSize(GLint internalFormat);
};
#endif // _OCL_GL_DEPTH_BUFFER_H_
@@ -0,0 +1,278 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLDepthTex.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// OpenCL source for the read-back kernel: each work-item samples the image at
// its (x, y) global id and stores the z component of the result into a linear
// float buffer indexed by row * image width + column.
static const char* strKernel =
    "__kernel void gldepths_test( __global float *output, "
    "read_only image2d_t source, sampler_t sampler){ \n"
    " int tidX = get_global_id(0);\n"
    " int tidY = get_global_id(1);\n"
    " float4 value = "
    "read_imagef( source, sampler, (int2)( tidX, tidY ) );\n"
    " output[ tidY * get_image_width( source ) + tidX ] = "
    "value.z;\n"
    "}\n";
// Starts with every GL name, CL handle and host buffer empty, the extension
// flagged as unsupported, and eight sub-tests registered with the harness
// (open() folds the index into four formats, built with and without
// -cl-std=CL2.0).
OCLGLDepthTex::OCLGLDepthTex()
    : _currentTest(0),
      glDepthBuffer_(0),
      frameBufferOBJ_(0),
      colorBuffer_(0),
      clOutputBuffer_(NULL),
      clDepth_(NULL),
      clSampler_(NULL),
      pGLOutput_(NULL),
      pCLOutput_(NULL),
      extensionSupported_(false) {
  _numSubTests = 8;
}
// Intentionally empty: GL/CL objects and host buffers are released in close().
OCLGLDepthTex::~OCLGLDepthTex() {}
void OCLGLDepthTex::open(unsigned int test, char* units, double& conversion,
unsigned int deviceId) {
OCLGLCommon::open(test, units, conversion, deviceId);
if (_errorFlag) return;
char* pExtensions = (char*)malloc(8192);
size_t returnSize;
_wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_EXTENSIONS, 8192,
pExtensions, &returnSize);
// if extension if not supported
if (!strstr(pExtensions, "cl_khr_gl_depth_images")) {
free(pExtensions);
printf("skipping test depth interop not supported\n");
return;
}
free(pExtensions);
extensionSupported_ = true;
static const char* OpenCL20Kernel = "-cl-std=CL2.0";
const char* options = OpenCL20Kernel;
if (test < 4) {
options = NULL;
}
_currentTest = test % 4;
// Build the kernel
program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
&error_);
CHECK_RESULT((error_ != CL_SUCCESS),
"clCreateProgramWithSource() failed (%d)", error_);
error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], options,
NULL, NULL);
if (error_ != CL_SUCCESS) {
char programLog[1024];
_wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
printf("\n%s\n", programLog);
fflush(stdout);
}
CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);
kernel_ = _wrapper->clCreateKernel(program_, "gldepths_test", &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
void OCLGLDepthTex::run(void) {
if (_errorFlag || !extensionSupported_) {
return;
}
bool retVal;
switch (_currentTest) {
case 0:
retVal = testDepthRead(GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL,
GL_UNSIGNED_INT_24_8);
break;
case 1:
retVal =
testDepthRead(GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_FLOAT);
break;
case 2:
retVal =
testDepthRead(GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT);
break;
case 3:
retVal = testDepthRead(GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL,
GL_FLOAT_32_UNSIGNED_INT_24_8_REV);
break;
default:
CHECK_RESULT(true, "unsupported test number\n");
}
CHECK_RESULT((retVal != true), "cl-gl depth test failed ");
}
bool OCLGLDepthTex::testDepthRead(GLint internalFormat, GLenum format,
GLenum type) {
const unsigned int bufferSize = c_dimSize * c_dimSize * 4;
pGLOutput_ = (float*)malloc(bufferSize);
pCLOutput_ = (float*)malloc(bufferSize);
size_t dimSizes[] = {c_dimSize, c_dimSize};
bool retVal = false;
// create Frame buffer object
glGenFramebuffers(1, &frameBufferOBJ_);
glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
// create textures
glGenTextures(1, &colorBuffer_);
glBindTexture(GL_TEXTURE_2D, colorBuffer_);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, c_dimSize, c_dimSize, 0, GL_RGBA,
GL_UNSIGNED_BYTE, 0);
glGenTextures(1, &glDepthBuffer_);
glBindTexture(GL_TEXTURE_2D, glDepthBuffer_);
glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, c_dimSize, c_dimSize, 0,
format, type, 0);
GLint glError = glGetError();
//
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, colorBuffer_, 0);
if (GL_DEPTH_COMPONENT == format) {
glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, glDepthBuffer_,
0);
} else {
glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
glDepthBuffer_, 0);
}
glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
if (GL_FRAMEBUFFER_COMPLETE != status) {
printf("frame buffer incomplete!\n");
return false;
}
// set up gl state machine
glViewport(0, 0, c_dimSize, c_dimSize); // Reset The Current Viewport
glMatrixMode(GL_PROJECTION); // Select The Projection Matrix
glLoadIdentity(); // Reset The Projection Matrix
gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
glMatrixMode(GL_MODELVIEW); // Select The Modelview Matrix
glLoadIdentity();
glEnable(GL_DEPTH_TEST);
glBindFramebuffer(GL_FRAMEBUFFER, frameBufferOBJ_);
cl_int error;
clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
bufferSize, NULL, &error);
if (CL_SUCCESS != error) return false;
clSampler_ = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_NONE,
CL_FILTER_NEAREST, &error);
if (CL_SUCCESS != error) return false;
clDepth_ = _wrapper->clCreateFromGLTexture(
context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, glDepthBuffer_, &error);
if (CL_SUCCESS != error) return false;
for (int i = 0; i < 3; ++i) {
// The Type Of Depth Testing To Do
glClear(GL_COLOR_BUFFER_BIT |
GL_DEPTH_BUFFER_BIT); // Clear Screen And Depth Buffer
const float zValues[3][2] = {
{-6.f, -3.f},
{-5.f, -2.f},
{-4.f, -1.f},
};
glBegin(GL_QUADS); // Draw A Quad
glVertex3f(-1.0f, 1.0f, zValues[i][0]); // Top Left
glVertex3f(1.0f, 1.0f, zValues[i][0]); // Top Right
glVertex3f(1.0f, -1.0f, zValues[i][1]); // Bottom Right
glVertex3f(-1.0f, -1.0f, zValues[i][1]); // Bottom Left
glEnd();
glFinish();
error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
&clDepth_, 0, NULL, NULL);
_wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
_wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clDepth_);
_wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_sampler), &clSampler_);
_wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2, NULL,
dimSizes, NULL, 0, NULL, NULL);
_wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 1, &clDepth_, 0,
NULL, NULL);
_wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_,
CL_TRUE, 0, bufferSize, pCLOutput_, 0, NULL,
NULL);
glReadPixels(0, 0, c_dimSize, c_dimSize, GL_DEPTH_COMPONENT, GL_FLOAT,
pGLOutput_);
// test that both resources are identical.
if (0 == memcmp(pGLOutput_, pCLOutput_, bufferSize)) {
retVal = true; // test successful
} else {
printf("expected results is different from actual results\n");
dumpBuffer(pGLOutput_, "GLDepth.csv", c_dimSize);
dumpBuffer(pCLOutput_, "clDepth_.csv", c_dimSize);
}
}
return retVal;
}
// Releases everything testDepthRead() created and then runs the common
// tear-down. Safe to call when the test was skipped or failed part-way:
// every handle is checked before release and reset afterwards.
// Returns whatever OCLGLCommon::close() returns.
unsigned int OCLGLDepthTex::close(void) {
  // free(NULL) is a no-op, so no guards are needed for the host buffers.
  free(pGLOutput_);
  pGLOutput_ = NULL;
  free(pCLOutput_);
  pCLOutput_ = NULL;

  // The CL objects only exist when testDepthRead() got far enough.
  if (clDepth_) {
    clReleaseMemObject(clDepth_);
    clDepth_ = 0;
  }
  if (clOutputBuffer_) {
    clReleaseMemObject(clOutputBuffer_);
    clOutputBuffer_ = 0;
  }
  if (clSampler_) {
    clReleaseSampler(clSampler_);
    clSampler_ = 0;
  }

  // unbind the texture and frame buffer. Skipped when no FBO was created,
  // otherwise the detach calls would target the default framebuffer.
  if (frameBufferOBJ_ != 0) {
    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
  }
  glBindFramebuffer(GL_FRAMEBUFFER, 0);

  // clean gl resources (deleting the name 0 is silently ignored by GL)
  glDeleteFramebuffers(1, &frameBufferOBJ_);
  frameBufferOBJ_ = 0;
  glDeleteTextures(1, &colorBuffer_);
  colorBuffer_ = 0;
  glDeleteTextures(1, &glDepthBuffer_);
  glDepthBuffer_ = 0;

  return OCLGLCommon::close();
}
@@ -0,0 +1,62 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_DEPTH_TEX_H_
#define _OCL_GL_DEPTH_TEX_H_
#include "OCLGLCommon.h"
// CL/GL interop test: renders into a GL depth texture, reads the same texture
// through an OpenCL image created from it, and compares the OpenCL result
// with the values returned by glReadPixels.
class OCLGLDepthTex : public OCLGLCommon {
public:
OCLGLDepthTex();
virtual ~OCLGLDepthTex();
// Width and height, in pixels, of the render target and of the CL dispatch.
static const unsigned int c_dimSize = 128;
// Checks for cl_khr_gl_depth_images and builds the read-back kernel.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
// Runs the sub-test chosen in open(); no-op if the test was skipped.
virtual void run(void);
// Releases the GL/CL objects and host buffers created by the test.
virtual unsigned int close(void);
private:
////////////////////
// test functions //
////////////////////
// Renders, reads depth back through GL and CL, and reports whether they match.
bool testDepthRead(GLint internalFormat, GLenum format, GLenum type);
// Sub-test index modulo 4 (set in open()); selects the depth format in run().
unsigned int _currentTest;
/////////////////////
// private members //
/////////////////////
// GL resource identifiers
// Depth texture name (created with glGenTextures despite the "Buffer" name).
GLuint glDepthBuffer_;
// Framebuffer object the colour and depth textures attach to.
GLuint frameBufferOBJ_;
// RGBA colour texture attached to GL_COLOR_ATTACHMENT0.
GLuint colorBuffer_;
// CL identifiers
// Buffer the kernel writes one float per pixel into.
cl_mem clOutputBuffer_;
// CL image created from glDepthBuffer_.
cl_mem clDepth_;
// Sampler passed to the kernel (non-normalised coords, nearest filtering).
cl_sampler clSampler_;
// pointers to buffers
// Host copy of the depth values read with glReadPixels.
float* pGLOutput_;
// Host copy of the depth values produced by the CL kernel.
float* pCLOutput_;
// Set in open() once cl_khr_gl_depth_images was found on the device.
bool extensionSupported_;
};
#endif // _OCL_GL_DEPTH_TEX_H_
@@ -0,0 +1,481 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLFenceSync.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "Timer.h"
#ifndef WIN_OS
#include <GL/glx.h>
#endif
// OpenCL source for the fence-sync kernel: each work-item copies one uint4
// from source to dest and adds 1 to every component.
static const char *strKernel =
    "__kernel void glmulticontext_test( __global uint4 *source, "
    "__global uint4 *dest) \n"
    "{  \n"
    " int tid = get_global_id(0);  \n"
    " dest[ tid ] = source [ tid ] + (uint4)(1);  \n"
    "}  \n";
// Zeroes the per-context handle table, registers two sub-tests (0: GL fence
// sync, 1: glFinish) and gives both status flags a defined starting value so
// run() never reads an indeterminate extensionSupported_.
OCLGLFenceSync::OCLGLFenceSync()
    : failed_(false), extensionSupported_(false) {
  memset(contextData_, 0, sizeof(contextData_));
  _numSubTests = 2;
}
// Intentionally empty: contexts, queues, programs and kernels are released in
// close().
OCLGLFenceSync::~OCLGLFenceSync() {}
// Function-pointer types for the GL sync entry points that InitSyncFns()
// looks up at run time. The Windows variants differ only in using __stdcall.
// NOTE(review): glIsSyncPtr is declared as returning bool; confirm this
// matches the return type of the GL entry point it is assigned from.
#ifdef WIN_OS
typedef GLsync(__stdcall *glFenceSyncPtr)(GLenum condition, GLbitfield flags);
typedef bool(__stdcall *glIsSyncPtr)(GLsync sync);
typedef void(__stdcall *glDeleteSyncPtr)(GLsync sync);
typedef GLenum(__stdcall *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
GLuint64 timeout);
typedef void(__stdcall *glWaitSyncPtr)(GLsync sync, GLbitfield flags,
GLuint64 timeout);
typedef void(__stdcall *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
typedef void(__stdcall *glGetSyncivPtr)(GLsync sync, GLenum pname,
GLsizei bufSize, GLsizei *length,
GLint *values);
#else
typedef GLsync (*glFenceSyncPtr)(GLenum condition, GLbitfield flags);
typedef bool (*glIsSyncPtr)(GLsync sync);
typedef void (*glDeleteSyncPtr)(GLsync sync);
typedef GLenum (*glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
GLuint64 timeout);
typedef void (*glWaitSyncPtr)(GLsync sync, GLbitfield flags, GLuint64 timeout);
typedef void (*glGetInteger64vPtr)(GLenum pname, GLint64 *params);
typedef void (*glGetSyncivPtr)(GLsync sync, GLenum pname, GLsizei bufSize,
GLsizei *length, GLint *values);
#endif
// NOTE(review): GLsync is already used by the typedefs above, so an included
// GL header must declare it too; this line is then a redundant redeclaration.
typedef struct __GLsync *GLsync;
// Entry points filled in by InitSyncFns(). They have external linkage.
glFenceSyncPtr glFenceSyncFunc;
glIsSyncPtr glIsSyncFunc;
glDeleteSyncPtr glDeleteSyncFunc;
glClientWaitSyncPtr glClientWaitSyncFunc;
glWaitSyncPtr glWaitSyncFunc;
glGetInteger64vPtr glGetInteger64vFunc;
glGetSyncivPtr glGetSyncivFunc;
// Prints the text for the current GL error (this also clears the GL error).
#define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError()))
// Defines the extension-name macro to 1; it is not tested in the code visible
// here. NOTE(review): confirm whether a CL header relies on it.
#define cl_khr_gl_event 1
// Looks up the GL sync entry points by name through the platform's
// GetProcAddress function (wgl on Windows, glX elsewhere) and stores them in
// the file-level gl*Func pointers. A pointer is left NULL-or-whatever the
// platform lookup returns when the entry point is unavailable.
static void InitSyncFns() {
#ifdef WIN_OS
#define OCLGL_SYNC_LOOKUP(name) wglGetProcAddress(name)
#else
#define OCLGL_SYNC_LOOKUP(name) glXGetProcAddress((GLubyte *)name)
#endif
  glFenceSyncFunc = (glFenceSyncPtr)OCLGL_SYNC_LOOKUP("glFenceSync");
  glIsSyncFunc = (glIsSyncPtr)OCLGL_SYNC_LOOKUP("glIsSync");
  glDeleteSyncFunc = (glDeleteSyncPtr)OCLGL_SYNC_LOOKUP("glDeleteSync");
  glClientWaitSyncFunc =
      (glClientWaitSyncPtr)OCLGL_SYNC_LOOKUP("glClientWaitSync");
  glWaitSyncFunc = (glWaitSyncPtr)OCLGL_SYNC_LOOKUP("glWaitSync");
  glGetInteger64vFunc =
      (glGetInteger64vPtr)OCLGL_SYNC_LOOKUP("glGetInteger64v");
  glGetSyncivFunc = (glGetSyncivPtr)OCLGL_SYNC_LOOKUP("glGetSynciv");
#undef OCLGL_SYNC_LOOKUP
}
// Marker define; it is not referenced in the code visible here.
#define USING_ARB_sync 1
// Signature of the clCreateEventFromGLsyncKHR extension entry point: creates
// a CL event from a GL sync object in the given context.
typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)(
cl_context context, GLsync sync, cl_int *errCode_ret);
// Filled in by OCLGLFenceSync::run() through clGetExtensionFunctionAddress().
clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr;
/* Returns true when `name` appears in the space-separated extension list
 * `list` as a complete token. A plain strstr() would also accept `name` as a
 * prefix or substring of a longer extension name. `name` must be non-empty. */
static bool extensionListContains(const char *list, const char *name) {
  const size_t nameLen = strlen(name);
  for (const char *hit = strstr(list, name); hit != NULL;
       hit = strstr(hit + 1, name)) {
    const bool startsToken = (hit == list) || (hit[-1] == ' ');
    const char tail = hit[nameLen];
    const bool endsToken = (tail == '\0') || (tail == ' ');
    if (startsToken && endsToken) return true;
  }
  return false;
}

/* Helper to determine if an extension is supported by a device.
 *
 * device        - device whose CL_DEVICE_EXTENSIONS string is queried.
 * extensionName - exact extension name to look for.
 *
 * Returns 0 when the extension is listed, and a negative value otherwise:
 *   -1  not listed (or extensionName is NULL/empty)
 *   -2  the size query failed
 *   -3  the device reports an empty extension string
 *   -40 the buffer for the extension string could not be allocated
 *   -5  reading the extension string failed
 */
int is_extension_available(cl_device_id device, const char *extensionName) {
  char *extString;
  size_t size = 0;
  int err;
  int result = -1;

  /* strstr() with a NULL needle is undefined; an empty one matches anything. */
  if (NULL == extensionName || '\0' == extensionName[0]) return result;

  if ((err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &size))) {
    printf(
        "Error: failed to determine size of device extensions string (err = "
        "%d)\n",
        err);
    return -2;
  }
  if (0 == size) return -3;

  /* One spare byte so the string is terminated whatever the driver wrote. */
  extString = (char *)malloc(size + 1);
  if (NULL == extString) {
    printf(
        "Error: unable to allocate %ld byte buffer for extension string (err = "
        "%d)\n",
        (long)size, err);
    return -40;
  }
  if ((err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, extString,
                             NULL))) {
    printf("Error: failed to obtain device extensions string (err = %d)\n",
           err);
    free(extString);
    return -5;
  }
  extString[size] = '\0';

  if (extensionListContains(extString, extensionName)) result = 0;
  free(extString);
  return result;
}
// Prepares the fence-sync sub-test.
//
// Runs the common GL/CL set-up and checks the device for cl_khr_gl_event.
// When the extension is missing the test is skipped (extensionSupported_
// stays false and run() becomes a no-op). Otherwise, for each of the
// c_glContextCount slots it creates a GL context, a CL context sharing with
// it, a command queue, and the glmulticontext_test program and kernel.
//
// test - sub-test index, stored in _openTest (0: GL fence sync, 1: glFinish).
// units, conversion, deviceId - forwarded unchanged to OCLGLCommon::open().
void OCLGLFenceSync::open(unsigned int test, char *units, double &conversion,
                          unsigned int deviceId) {
  _openTest = test;
  // Defined on every exit path, including the early returns below.
  extensionSupported_ = false;
  // Initialize random number seed
  srand((unsigned int)time(NULL));
  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;

  // The answer does not depend on the context slot, so query it once before
  // any context is created; nothing has to be unwound when it is missing.
  error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event");
  if (error_ != CL_SUCCESS) {
    printf("Silent failure: cl_khr_gl_event extension not available (%d)\n",
           error_);
    return;
  }
  extensionSupported_ = true;

  cl_context_properties properties[7] = {0};
  for (unsigned int i = 0; i < c_glContextCount; i++) {
    createGLContext(contextData_[i].glContext);
    getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties);

    // Create new CL context from GL context
    contextData_[i].clContext = _wrapper->clCreateContext(
        properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)",
                 error_);

    // Create command queue for new context
    contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue(
        contextData_[i].clContext, devices_[_deviceId], 0, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
                 error_);

    // Build the kernel
    contextData_[i].clProgram = _wrapper->clCreateProgramWithSource(
        contextData_[i].clContext, 1, &strKernel, NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clCreateProgramWithSource() failed (%d)", error_);
    error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1,
                                      &devices_[deviceId], NULL, NULL, NULL);
    if (error_ != CL_SUCCESS) {
      // Zero-initialised so a failed log query prints an empty string.
      char programLog[1024] = {0};
      _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram,
                                      devices_[deviceId], CL_PROGRAM_BUILD_LOG,
                                      1024, programLog, 0);
      printf("\n%s\n", programLog);
      fflush(stdout);
    }
    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
                 error_);
    contextData_[i].clKernel = _wrapper->clCreateKernel(
        contextData_[i].clProgram, "glmulticontext_test", &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
                 error_);
  }
}
// Runs the fence-sync sub-test and records the time spent synchronising.
//
// For every GL/CL context pair: uploads the data into a GL buffer, shares the
// input and output GL buffers with CL, acquires them (sub-test 0 waits on a
// CL event created from a GL fence, sub-test 1 calls glFinish first), runs
// the kernel that adds 1 to every component, releases the buffers and maps
// the GL output back into host memory.
//
// Only the calls wrapped in timer.Start()/Stop() are measured; the elapsed
// time is stored in _perfInfo in microseconds. The test fails through
// CHECK_RESULT on any CL/GL error and when the final data differs from the
// expected values.
void OCLGLFenceSync::run() {
  if (_errorFlag || !extensionSupported_) {
    return;
  }
  CPerfCounter timer;
  double sec;
  float perf;
  cl_uint4 inOutData[c_numOfElements] = {{{0}}};
  cl_uint4 expectedData[c_numOfElements] = {{{0}}};
  unsigned int m = sizeof(cl_uint4) / sizeof(cl_uint);
  int count = 0;

  // Element i starts as i in every component; each context adds 1.
  for (unsigned int i = 0; i < c_numOfElements; i++) {
    for (unsigned int j = 0; j < m; j++) {
      inOutData[i].s[j] = (unsigned int)i;
      expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount;
    }
  }

  cl_event fenceEvent0 = NULL, fenceEvent = NULL;
  GLsync glFence0 = NULL, glFence = NULL;
  InitSyncFns();
  clCreateEventFromGLsyncKHR_ptr =
      (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddress(
          "clCreateEventFromGLsyncKHR");
  if (clCreateEventFromGLsyncKHR_ptr == NULL) {
    printf(
        "ERROR: Unable to run fence_sync test (clCreateEventFromGLsyncKHR "
        "function not discovered!)\n");
    return;
  }

  for (unsigned int i = 0; i < c_glContextCount; i++) {
    makeCurrent(contextData_[i].glContext);

    // Generate and Bind in & out OpenGL buffers
    GLuint inGLBuffer = 0, outGLBuffer = 0;
    glGenBuffers(1, &inGLBuffer);
    glGenBuffers(1, &outGLBuffer);
    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData,
                 GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL,
                 GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glFinish();

    // Checking if clWaitForEvents works
    switch (_openTest) {
      case 0:  // Using fence sync
        glFence0 = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
        CHECK_RESULT((glFence0 == NULL), "Unable to create GL fence");
        fenceEvent0 = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext,
                                                     glFence0, &error_);
        CHECK_RESULT((error_ != CL_SUCCESS),
                     "Unable to create CL event from GL fence (%d)", error_);
        error_ = clWaitForEvents(1, &fenceEvent0);
        CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)",
                     error_);
        break;
      default:
        glFinish();
        break;
    }
    // Release the warm-up fence created above. The test has to be on
    // fenceEvent0 itself, otherwise the pair leaks on the first iteration.
    if (fenceEvent0 != NULL) {
      clReleaseEvent(fenceEvent0);
      fenceEvent0 = NULL;
    }
    if (glFence0 != NULL) {
      glDeleteSync(glFence0);
      glFence0 = NULL;
    }

    cl_event acqEvent1 = 0, acqEvent2 = 0, kernelEvent = 0, relEvent1 = 0,
             relEvent2 = 0;

    // Create input buffer from GL input buffer
    contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer(
        contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "Unable to create input GL buffer (%d)", error_);

    // Create output buffer from GL output buffer
    contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer(
        contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "Unable to create output GL buffer (%d)", error_);

    timer.Reset();
    switch (_openTest) {
      case 0:  // Using fence sync
        timer.Start();
        glFence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
        timer.Stop();
        CHECK_RESULT((glFence == NULL), "Unable to create GL fence");
        timer.Start();
        fenceEvent = clCreateEventFromGLsyncKHR_ptr(contextData_[i].clContext,
                                                    glFence, &error_);
        timer.Stop();
        CHECK_RESULT((error_ != CL_SUCCESS),
                     "Unable to create CL event from GL fence (%d)", error_);
        break;
      default:
        break;
    }

    error_ =
        _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem),
                                 &(contextData_[i].inputBuffer));
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);
    error_ =
        _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem),
                                 &(contextData_[i].outputBuffer));
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);

    switch (_openTest) {
      case 0:  // Using fence sync
        timer.Start();
        error_ = _wrapper->clEnqueueAcquireGLObjects(
            contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 1,
            &fenceEvent, &acqEvent1);
        timer.Stop();
        CHECK_RESULT((error_ != CL_SUCCESS),
                     "Unable to acquire GL objects (%d)", error_);
        timer.Start();
        error_ = _wrapper->clEnqueueAcquireGLObjects(
            contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1,
            &fenceEvent, &acqEvent2);
        timer.Stop();
        CHECK_RESULT((error_ != CL_SUCCESS),
                     "Unable to acquire GL objects (%d)", error_);
        break;
      case 1:  // Using glFinish
        timer.Start();
        glFinish();
        timer.Stop();
        timer.Start();
        error_ = _wrapper->clEnqueueAcquireGLObjects(
            contextData_[i].clCmdQueue, 1, &(contextData_[i].inputBuffer), 0,
            NULL, &acqEvent1);
        timer.Stop();
        CHECK_RESULT((error_ != CL_SUCCESS),
                     "Unable to acquire GL objects (%d)", error_);
        timer.Start();
        error_ = _wrapper->clEnqueueAcquireGLObjects(
            contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0,
            NULL, &acqEvent2);
        timer.Stop();
        CHECK_RESULT((error_ != CL_SUCCESS),
                     "Unable to acquire GL objects (%d)", error_);
        break;
      default:
        break;
    }

    // The kernel waits on both acquires; both releases wait on the kernel.
    size_t gws[1] = {c_numOfElements};
    cl_event evts[2] = {acqEvent1, acqEvent2};
    error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue,
                                              contextData_[i].clKernel, 1, NULL,
                                              gws, NULL, 2, evts, &kernelEvent);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
                 error_);
    error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1,
                                                 &(contextData_[i].inputBuffer),
                                                 1, &kernelEvent, &relEvent1);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clEnqueueReleaseGLObjects failed (%d)", error_);
    error_ = _wrapper->clEnqueueReleaseGLObjects(
        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 1,
        &kernelEvent, &relEvent2);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clEnqueueReleaseGLObjects failed (%d)", error_);
    evts[0] = relEvent1;
    evts[1] = relEvent2;
    error_ = clWaitForEvents(2, evts);
    CHECK_RESULT((error_ != CL_SUCCESS), "clWaitForEvents() failed (%d)",
                 error_);

    // Everything queued for this context has completed; drop the event
    // references that the enqueue calls handed back.
    cl_event iterationEvents[] = {acqEvent1, acqEvent2, kernelEvent, relEvent1,
                                  relEvent2};
    for (unsigned int k = 0;
         k < sizeof(iterationEvents) / sizeof(iterationEvents[0]); k++) {
      if (iterationEvents[k] != NULL) clReleaseEvent(iterationEvents[k]);
    }

    // Read the result back; it becomes the input for the next context.
    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
    void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
    CHECK_RESULT((glMem == NULL), "glMapBuffer() failed");
    memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4));
    glUnmapBuffer(GL_ARRAY_BUFFER);

    _wrapper->clReleaseMemObject(contextData_[i].inputBuffer);
    _wrapper->clReleaseMemObject(contextData_[i].outputBuffer);

    // Delete GL buffers
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glDeleteBuffers(1, &inGLBuffer);
    inGLBuffer = 0;
    glDeleteBuffers(1, &outGLBuffer);
    outGLBuffer = 0;

    // Release this iteration's fence pair while its GL context is current,
    // so a further iteration cannot overwrite and leak it.
    if (fenceEvent != NULL) {
      clReleaseEvent(fenceEvent);
      fenceEvent = NULL;
    }
    if (glFence != NULL) {
      glDeleteSync(glFence);
      glFence = NULL;
    }
  }

  sec = timer.GetElapsedTime();
  perf = (float)sec * 1000000;  // in microseconds
  _perfInfo = (float)perf;

  // Compare expected output with actual data received
  for (unsigned int i = 0; i < c_numOfElements; i++) {
    for (unsigned int j = 0; j < m; j++) {
      if (inOutData[i].s[j] != expectedData[i].s[j]) {
        printf(
            "Element %u is incorrect!\t expected:[ %u, %u, %u, %u ] differs "
            "from actual:{%u, %u, %u, %u}\n",
            i, expectedData[i].s[0], expectedData[i].s[1], expectedData[i].s[2],
            expectedData[i].s[3], inOutData[i].s[0], inOutData[i].s[1],
            inOutData[i].s[2], inOutData[i].s[3]);
        count++;
      }
    }
  }
  if (count) printf("Number of elements wrong: %d\n", count);
  // Wrong data has to fail the test, not just be printed.
  CHECK_RESULT((count != 0),
               "cl-gl fence sync produced %d incorrect components", count);
}
unsigned int OCLGLFenceSync::close() {
error_ = is_extension_available(devices_[_deviceId], "cl_khr_gl_event");
if (error_ == CL_SUCCESS) {
for (unsigned int i = 0; i < c_glContextCount; i++) {
makeCurrent(contextData_[i].glContext);
_wrapper->clReleaseKernel(contextData_[i].clKernel);
_wrapper->clReleaseProgram(contextData_[i].clProgram);
_wrapper->clReleaseCommandQueue(contextData_[i].clCmdQueue);
_wrapper->clReleaseContext(contextData_[i].clContext);
destroyGLContext(contextData_[i].glContext);
}
}
return OCLGLCommon::close();
}
@@ -0,0 +1,55 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_FENCE_SYNC_H_
#define _OCL_GL_FENCE_SYNC_H_
#include "OCLGLCommon.h"
// CL/GL interop test built on OCLGLCommon. It keeps one set of GL/CL objects
// per GL context (see GLContextDataSet) and exposes the usual
// open()/run()/close() test entry points.
class OCLGLFenceSync : public OCLGLCommon {
public:
OCLGLFenceSync();
virtual ~OCLGLFenceSync();
// Test entry points.
// close() returns whatever OCLGLCommon::close() returns.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
virtual void run(void);
virtual unsigned int close(void);
private:
// Number of GL contexts (and matching CL contexts) the test works with.
static const unsigned int c_glContextCount = 1;
// Number of cl_uint4 elements processed and verified by the test.
static const unsigned int c_numOfElements = 8192;
// Everything tied to a single GL context: the GL context handle and the CL
// objects used together with it.
struct GLContextDataSet {
OCLGLHandle glContext;
cl_context clContext;
cl_command_queue clCmdQueue;
cl_program clProgram;
cl_kernel clKernel;
cl_mem inputBuffer;
cl_mem outputBuffer;
};
GLContextDataSet contextData_[c_glContextCount];
// NOTE(review): the code that sets/reads these two flags is outside this
// header; presumably they record a failed run and cl_khr_gl_event
// availability - confirm against the .cpp.
bool failed_;
bool extensionSupported_;
};
#endif // _OCL_GL_FENCE_SYNC_H_
@@ -0,0 +1,298 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLMsaaTexture.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// OpenCL C source of the "gl_msaa_test" kernel. Each work-item (tidX, tidY)
// reads every sample of the multisample image at its coordinate and stores
// sample i as one uint4 at
//   output[(tidY * image_width + tidX) * numSamples + i]
// so the output buffer must hold width * height * numSamples uint4 elements.
const static char* strKernel =
"__kernel void gl_msaa_test( __global uint4 *output, read_only "
"image2d_msaa_t source, unsigned int numSamples){ \n"
" int tidX = get_global_id(0);\n"
" int tidY = get_global_id(1);\n"
" for (int i = 0 ; i < numSamples ; i++) {\n"
" uint4 value = read_imageui( source, (int2)( tidX, tidY ) ,i);\n"
" int index = (tidY * get_image_width( source ) + tidX)*numSamples + "
"i;\n"
" output[ index ] = value;\n"
" }\n"
"}\n";
// GLSL fragment shader source used for the GL-side down-sample pass. For the
// current fragment it fetches numSamples texels from the multisample sampler
// MsaaTex at the integer coordinate resolution * gl_TexCoord[0].xy, and writes
// their average to gl_FragColor.
const static char* glDownSampleShader =
"uniform sampler2DMS MsaaTex;\n"
"uniform int numSamples;\n"
"uniform ivec2 resolution;\n"
"\n"
"varying vec4 gl_TexCoord[ ]; \n"
"\n"
"void main(void)\n"
"{\n"
" vec4 accum = vec4(0.0,0.0,0.0,0.0);\n"
" ivec2 coord = ivec2(resolution * gl_TexCoord[0].xy) ;\n"
" for ( int i = 0 ; i < numSamples ; i++)\n"
" {\n"
" accum += texelFetch(MsaaTex,coord,i);\n"
" }\n"
" accum /= numSamples;\n"
" \n"
" \n"
" \n"
" gl_FragColor = accum;\n"
"}";
// Starts with every GL/CL handle and host buffer pointer cleared and selects
// sub-test 0; the test exposes exactly one sub-test.
OCLGLMsaaTexture::OCLGLMsaaTexture()
    : _currentTest(0),
      msaaDepthBuffer_(0),
      msaaFrameBufferOBJ_(0),
      msaaColorBuffer_(0),
      glShader_(0),
      glprogram_(0),
      clOutputBuffer_(0),
      clMsaa_(0),
      pGLOutput_(0),
      pCLOutput_(0) {
  _numSubTests = 1;
}
// Empty destructor: GL/CL objects and host buffers are released in close().
OCLGLMsaaTexture::~OCLGLMsaaTexture() {}
// Prepares the test: runs the common CL/GL setup, remembers the requested
// sub-test and builds the "gl_msaa_test" kernel from strKernel.
//
// test       - sub-test index, stored in _currentTest
// units      - forwarded to OCLGLCommon::open()
// conversion - forwarded to OCLGLCommon::open()
// deviceId   - index into devices_ of the device the program is built for
//
// Failures are reported through CHECK_RESULT; nothing is built when the base
// open() already flagged an error.
void OCLGLMsaaTexture::open(unsigned int test, char* units, double& conversion,
                            unsigned int deviceId) {
  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;
  _currentTest = test;

  // Build the kernel
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateProgramWithSource() failed (%d)", error_);

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Query the log length first: requesting the log with a buffer that is
    // too small fails with CL_INVALID_VALUE and leaves the buffer untouched,
    // which previously meant printing an uninitialized fixed-size array.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog = (char*)malloc(logSize + 1);
    if (programLog != NULL) {
      // Zero-fill so the text is terminated even if the query fails.
      memset(programLog, 0, logSize + 1);
      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                      CL_PROGRAM_BUILD_LOG, logSize,
                                      programLog, NULL);
      printf("\n%s\n", programLog);
      free(programLog);
    }
    fflush(stdout);
  }
  // error_ still holds the clBuildProgram() result here.
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);

  kernel_ = _wrapper->clCreateKernel(program_, "gl_msaa_test", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
// Runs the sub-test selected in open(). Does nothing when an error has
// already been flagged. Sub-test 0 reads a 2-sample RGBA MSAA texture through
// OpenCL; any other index is reported as unsupported.
void OCLGLMsaaTexture::run(void) {
  if (_errorFlag) {
    return;
  }

  // Initialised so the final check never reads an indeterminate value when
  // the sub-test index is not handled below.
  bool retVal = false;
  switch (_currentTest) {
    case 0:
      retVal = testMsaaRead(GL_RGBA, 2);
      break;
    default:
      CHECK_RESULT(true, "unsupported test number\n");
      break;
  }
  // The message previously said "depth test", copied from another test.
  CHECK_RESULT((retVal != true), "cl-gl msaa texture test failed ");
}
// Releases everything testMsaaRead() may have created (host buffers, CL
// memory objects, GL framebuffer/textures/shader program) and then returns
// the result of OCLGLCommon::close().
//
// testMsaaRead() can return before the CL memory objects exist (for example
// when the framebuffer is incomplete), so CL handles are only released when
// they are non-NULL. Every handle is reset after release so no stale value
// is left behind.
unsigned int OCLGLMsaaTexture::close(void) {
  // free(NULL) is a no-op, so no guard is needed for the host buffers.
  free(pGLOutput_);
  pGLOutput_ = NULL;
  free(pCLOutput_);
  pCLOutput_ = NULL;

  if (clMsaa_ != NULL) {
    clReleaseMemObject(clMsaa_);
    clMsaa_ = NULL;
  }
  if (clOutputBuffer_ != NULL) {
    clReleaseMemObject(clOutputBuffer_);
    clOutputBuffer_ = NULL;
  }

  glFinish();

  // unbind the texture and frame buffer.
  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, 0, 0);
  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, 0, 0);
  glBindFramebuffer(GL_FRAMEBUFFER, 0);
  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0);

  // clean gl resources
  glDeleteFramebuffers(1, &msaaFrameBufferOBJ_);
  msaaFrameBufferOBJ_ = 0;
  glDeleteTextures(1, &msaaColorBuffer_);
  msaaColorBuffer_ = 0;
  glDeleteTextures(1, &msaaDepthBuffer_);
  msaaDepthBuffer_ = 0;
  glDeleteProgram(glprogram_);
  glprogram_ = 0;
  glDeleteShader(glShader_);
  glShader_ = 0;

  return OCLGLCommon::close();
}
// Renders a quad into a multisample FBO, reads the multisample colour texture
// through the OpenCL kernel, down-samples the same texture with the GL shader
// and compares the two results with absDiff().
//
// internalFormat - currently unused; the colour texture is always created as
//                  GL_RGBA8 (NOTE(review): confirm whether it should be used)
// numSamples     - sample count of the multisample textures and the number of
//                  samples the kernel reads per pixel
//
// Returns true when every step succeeded and absDiff() accepts the data,
// false otherwise. Objects created here are released in close().
bool OCLGLMsaaTexture::testMsaaRead(GLint internalFormat,
                                    unsigned int numSamples) {
  (void)internalFormat;
  size_t dimSizes[] = {c_dimSize, c_dimSize};
  // Host-side size: one 4-byte pixel per texel (what glReadPixels produces
  // and what absDiff() inspects).
  unsigned int bufferSize = c_dimSize * c_dimSize * 4;
  // Device-side size: the kernel stores numSamples uint4 values per texel, so
  // a bufferSize-byte buffer would be overrun on the device.
  size_t clBufferSize =
      (size_t)c_dimSize * c_dimSize * numSamples * sizeof(cl_uint4);
  if (clBufferSize < bufferSize) {
    // Keep the later bufferSize-byte read-back inside the buffer.
    clBufferSize = bufferSize;
  }

  createGLFragmentProgramFromSource(glDownSampleShader, glShader_, glprogram_);

  /////////////////////
  // create msaa FBO //
  /////////////////////
  glGenFramebuffers(1, &msaaFrameBufferOBJ_);
  glBindFramebuffer(GL_FRAMEBUFFER, msaaFrameBufferOBJ_);
  // create textures
  glGenTextures(1, &msaaColorBuffer_);
  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_);
  glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples, GL_RGBA8,
                          c_dimSize, c_dimSize, GL_TRUE);
  glGenTextures(1, &msaaDepthBuffer_);
  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaDepthBuffer_);
  glTexImage2DMultisample(GL_TEXTURE_2D_MULTISAMPLE, numSamples,
                          GL_DEPTH_COMPONENT24, c_dimSize, c_dimSize, GL_TRUE);
  //
  glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, msaaColorBuffer_,
                       0);
  glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, msaaDepthBuffer_,
                       0);
  // verify all resource allocations are well.
  GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
  if (GL_FRAMEBUFFER_COMPLETE != status) {
    return false;
  }

  // set up gl state machine
  glViewport(0, 0, c_dimSize, c_dimSize);  // Reset The Current Viewport
  glMatrixMode(GL_PROJECTION);             // Select The Projection Matrix
  glLoadIdentity();                        // Reset The Projection Matrix
  gluPerspective(30.0f, (GLfloat)c_dimSize / (GLfloat)c_dimSize, 0.1f, 100.0f);
  glMatrixMode(GL_MODELVIEW);  // Select The Modelview Matrix
  glLoadIdentity();
  glEnable(GL_DEPTH_TEST);
  // The Type Of Depth Testing To Do
  glClear(GL_COLOR_BUFFER_BIT |
          GL_DEPTH_BUFFER_BIT);     // Clear Screen And Depth Buffer
  glBegin(GL_QUADS);                // Draw A Quad
  glVertex3f(-1.0f, 1.0f, -6.0f);   // Top Left
  glVertex3f(1.0f, 1.0f, -6.0f);    // Top Right
  glVertex3f(1.0f, -1.0f, -3.0f);   // Bottom Right
  glVertex3f(-1.0f, -1.0f, -3.0f);  // Bottom Left
  glEnd();
  glFinish();

  cl_int error;
  clOutputBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
                                             clBufferSize, NULL, &error);
  if (CL_SUCCESS != error) return false;
  clMsaa_ = _wrapper->clCreateFromGLTexture(context_, CL_MEM_READ_WRITE,
                                            GL_TEXTURE_2D_MULTISAMPLE, 0,
                                            msaaColorBuffer_, &error);
  if (CL_SUCCESS != error) return false;

  // The sample-count query is informational only: its status was never
  // checked by the test, so it stays non-fatal here.
  GLsizei samples = 0;
  _wrapper->clGetGLTextureInfo(clMsaa_, CL_GL_NUM_SAMPLES, sizeof(samples),
                               &samples, NULL);

  error = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 1,
                                              &clMsaa_, 0, NULL, NULL);
  if (CL_SUCCESS != error) return false;

  // CL_SUCCESS is 0, so OR-ing the statuses is non-zero if any call failed.
  error =
      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &clOutputBuffer_);
  error |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), &clMsaa_);
  error |=
      _wrapper->clSetKernelArg(kernel_, 2, sizeof(unsigned int), &numSamples);
  if (CL_SUCCESS == error) {
    error = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
                                             NULL, dimSizes, NULL, 0, NULL,
                                             NULL);
  }
  // Always hand the texture back to GL once it has been acquired.
  const cl_int releaseError = _wrapper->clEnqueueReleaseGLObjects(
      cmdQueues_[_deviceId], 1, &clMsaa_, 0, NULL, NULL);
  if (CL_SUCCESS != error || CL_SUCCESS != releaseError) return false;

  pGLOutput_ = (unsigned int*)malloc(bufferSize);
  pCLOutput_ = (unsigned int*)malloc(bufferSize);
  if (pGLOutput_ == NULL || pCLOutput_ == NULL) return false;

  // NOTE(review): only the first bufferSize bytes of the kernel output are
  // read back and compared against the down-sampled 8-bit pixels; confirm
  // this is the comparison the test intends.
  error = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], clOutputBuffer_,
                                        CL_TRUE, 0, bufferSize, pCLOutput_, 0,
                                        NULL, NULL);
  if (CL_SUCCESS != error) return false;

  // down sample
  glBindFramebuffer(GL_FRAMEBUFFER, 0);
  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, msaaColorBuffer_);
  glUseProgram(glprogram_);
  glUniform1i(glGetUniformLocation(glprogram_, "numSamples"), numSamples);
  glUniform2i(glGetUniformLocation(glprogram_, "resolution"), c_dimSize,
              c_dimSize);
  glUniform1i(glGetUniformLocation(glprogram_, "MsaaTex"), 0);
  // printOpenGLError();
  glBegin(GL_QUADS);
  glVertex2f(-1.0f, 1.0f);
  glTexCoord2f(1.0f, 0.0f);
  glVertex2f(1.0f, 1.0f);
  glTexCoord2f(1.0f, 1.0f);
  glVertex2f(1.0f, -1.0f);
  glTexCoord2f(0.0f, 1.0f);
  glVertex2f(-1.0f, -1.0f);
  glTexCoord2f(0.0f, 0.0f);
  glEnd();
  glBindTexture(GL_TEXTURE_2D_MULTISAMPLE, 0);
  glUseProgram(0);
  glReadPixels(0, 0, c_dimSize, c_dimSize, GL_BGRA, GL_UNSIGNED_BYTE,
               pGLOutput_);

  return absDiff(pGLOutput_, pCLOutput_, c_dimSize);
}
// Compares two images of c_dimSize x c_dimSize pixels, 4 bytes per pixel.
//
// pGLBuffer - pixels produced on the GL side
// pCLBuffer - pixels produced on the CL side
// c_dimSize - width and height of the images, in pixels
//
// Returns true when every byte of every pixel differs by at most 10 between
// the two buffers, false as soon as one byte differs by more.
bool OCLGLMsaaTexture::absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
                               const unsigned int c_dimSize) {
  const int kMaxChannelDiff = 10;
  for (unsigned int i = 0; i < c_dimSize * c_dimSize; i++) {
    // Channels are raw bytes in 0..255; unsigned storage keeps values above
    // 127 from turning negative and hiding large differences.
    unsigned char clPixel[4];
    unsigned char glPixel[4];
    memcpy(clPixel, &(pCLBuffer[i]), sizeof(clPixel));
    memcpy(glPixel, &(pGLBuffer[i]), sizeof(glPixel));
    for (int j = 0; j < 4; j++) {
      // Compare channel j with channel j. The GL side was previously indexed
      // with the pixel counter i, reading far outside the 4-byte array.
      const int diff = abs((int)clPixel[j] - (int)glPixel[j]);
      if (diff > kMaxChannelDiff) return false;
    }
  }
  return true;
}
@@ -0,0 +1,68 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_MSAA_TEXTURE_H_
#define _OCL_GL_MSAA_TEXTURE_H_
#include "OCLGLCommon.h"
// CL/GL interop test that reads a multisample (MSAA) GL texture through
// OpenCL and compares it with a GL-side down-sample of the same texture.
class OCLGLMsaaTexture : public OCLGLCommon {
public:
OCLGLMsaaTexture();
virtual ~OCLGLMsaaTexture();
// Width and height, in pixels, of the textures used by the test.
static const unsigned int c_dimSize = 128;
// Test entry points.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
virtual void run(void);
virtual unsigned int close(void);
private:
////////////////////
// test functions //
////////////////////
// Runs the MSAA read test with the given sample count; returns true on
// success.
bool testMsaaRead(GLint internalFormat, unsigned int NumSamples);
// Index of the sub-test selected in open().
unsigned int _currentTest;
//////////////////////////////
// private helper functions //
//////////////////////////////
// Compares the GL and CL pixel buffers (dimSize x dimSize pixels, 4 bytes
// each) using a small per-byte tolerance; returns true when they match.
static bool absDiff(unsigned int* pGLBuffer, unsigned int* pCLBuffer,
const unsigned int dimSize);
/////////////////////
// private members //
/////////////////////
// GL resource identifiers
GLuint msaaDepthBuffer_;
GLuint msaaFrameBufferOBJ_;
GLuint msaaColorBuffer_;
GLuint glShader_;
GLuint glprogram_;
// CL identifiers
cl_mem clOutputBuffer_;
cl_mem clMsaa_;
// Host-side result buffers, allocated with malloc() in testMsaaRead() and
// freed in close().
unsigned int* pGLOutput_;
unsigned int* pCLOutput_;
};
#endif // _OCL_GL_MSAA_TEXTURE_H_
@@ -0,0 +1,231 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLMultiContext.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// OpenCL C source of the "glmulticontext_test" kernel: work-item tid copies
// source[tid] to dest[tid] with 1 added to every component of the uint4.
const static char* strKernel =
"__kernel void glmulticontext_test( __global uint4 *source, __global uint4 "
"*dest) \n"
"{ "
" \n"
" int tid = get_global_id(0); "
" \n"
" dest[ tid ] = source[ tid ] + (uint4)(1); "
" \n"
"} "
" \n";
// Clears all per-context handles so close() can tell which objects were never
// created, and declares a single sub-test. failed_ is initialised explicitly;
// it was previously left indeterminate.
OCLGLMultiContext::OCLGLMultiContext() : failed_(false) {
  memset(contextData_, 0, sizeof(contextData_));
  _numSubTests = 1;
}
// Empty destructor: per-context CL objects and GL contexts are released in
// close().
OCLGLMultiContext::~OCLGLMultiContext() {}
// Prepares the test: seeds rand(), runs the common CL/GL setup and then, for
// each of the c_glContextCount GL contexts, creates the GL context, a CL
// context sharing with it, a command queue, and the built
// "glmulticontext_test" kernel.
//
// test, units, conversion - forwarded to OCLGLCommon::open()
// deviceId                - index into devices_ of the device to build for
//
// Failures are reported through CHECK_RESULT; nothing is created when the
// base open() already flagged an error.
void OCLGLMultiContext::open(unsigned int test, char* units, double& conversion,
                             unsigned int deviceId) {
  // Initialize random number seed
  srand((unsigned int)time(NULL));

  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;

  cl_context_properties properties[7] = {0};
  for (unsigned int i = 0; i < c_glContextCount; i++) {
    createGLContext(contextData_[i].glContext);
    getCLContextPropertiesFromGLContext(contextData_[i].glContext, properties);

    // Create new CL context from GL context
    contextData_[i].clContext = _wrapper->clCreateContext(
        properties, 1, &devices_[_deviceId], NULL, NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateContext() failed (%d)",
                 error_);

    // Create command queue for new context
    contextData_[i].clCmdQueue = _wrapper->clCreateCommandQueue(
        contextData_[i].clContext, devices_[_deviceId], 0, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed (%d)",
                 error_);

    // Build the kernel
    contextData_[i].clProgram = _wrapper->clCreateProgramWithSource(
        contextData_[i].clContext, 1, &strKernel, NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clCreateProgramWithSource() failed (%d)", error_);

    error_ = _wrapper->clBuildProgram(contextData_[i].clProgram, 1,
                                      &devices_[deviceId], NULL, NULL, NULL);
    if (error_ != CL_SUCCESS) {
      // Query the log length first: requesting the log with a buffer that is
      // too small fails with CL_INVALID_VALUE and leaves the buffer untouched,
      // which previously meant printing an uninitialized fixed-size array.
      size_t logSize = 0;
      _wrapper->clGetProgramBuildInfo(contextData_[i].clProgram,
                                      devices_[deviceId], CL_PROGRAM_BUILD_LOG,
                                      0, NULL, &logSize);
      char* programLog = (char*)malloc(logSize + 1);
      if (programLog != NULL) {
        // Zero-fill so the text is terminated even if the query fails.
        memset(programLog, 0, logSize + 1);
        _wrapper->clGetProgramBuildInfo(
            contextData_[i].clProgram, devices_[deviceId],
            CL_PROGRAM_BUILD_LOG, logSize, programLog, NULL);
        printf("\n%s\n", programLog);
        free(programLog);
      }
      fflush(stdout);
    }
    // error_ still holds the clBuildProgram() result here.
    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
                 error_);

    contextData_[i].clKernel = _wrapper->clCreateKernel(
        contextData_[i].clProgram, "glmulticontext_test", &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
                 error_);
  }
}
// Runs the multi-context test. Random data is pushed through every GL/CL
// context in turn: each context uploads the current data into a GL buffer,
// runs the kernel (which adds 1 to every component) into a second GL buffer
// and reads that back as the input for the next context. After all contexts
// the data must equal the original values plus c_glContextCount.
//
// Does nothing when an error has already been flagged; failures are reported
// through CHECK_RESULT.
void OCLGLMultiContext::run() {
  if (_errorFlag) {
    return;
  }

  cl_uint4 inOutData[c_numOfElements] = {{{0}}};
  cl_uint4 expectedData[c_numOfElements] = {{{0}}};

  // Initialize input data with random values
  for (unsigned int i = 0; i < c_numOfElements; i++) {
    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
      inOutData[i].s[j] = (unsigned int)rand();
      expectedData[i].s[j] = inOutData[i].s[j] + c_glContextCount;
    }
  }

  for (unsigned int i = 0; i < c_glContextCount; i++) {
    makeCurrent(contextData_[i].glContext);

    // Generate and Bind in & out OpenGL buffers
    GLuint inGLBuffer = 0, outGLBuffer = 0;
    glGenBuffers(1, &inGLBuffer);
    glGenBuffers(1, &outGLBuffer);
    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), inOutData,
                 GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
    glBufferData(GL_ARRAY_BUFFER, c_numOfElements * sizeof(cl_uint4), NULL,
                 GL_STATIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glFinish();

    // Create input buffer from GL input buffer
    contextData_[i].inputBuffer = _wrapper->clCreateFromGLBuffer(
        contextData_[i].clContext, CL_MEM_READ_ONLY, inGLBuffer, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "Unable to create input GL buffer (%d)", error_);

    // Create output buffer from GL output buffer
    contextData_[i].outputBuffer = _wrapper->clCreateFromGLBuffer(
        contextData_[i].clContext, CL_MEM_WRITE_ONLY, outGLBuffer, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "Unable to create output GL buffer (%d)", error_);

    error_ =
        _wrapper->clSetKernelArg(contextData_[i].clKernel, 0, sizeof(cl_mem),
                                 &(contextData_[i].inputBuffer));
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);
    error_ =
        _wrapper->clSetKernelArg(contextData_[i].clKernel, 1, sizeof(cl_mem),
                                 &(contextData_[i].outputBuffer));
    CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed (%d)",
                 error_);

    error_ = _wrapper->clEnqueueAcquireGLObjects(contextData_[i].clCmdQueue, 1,
                                                 &(contextData_[i].inputBuffer),
                                                 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
                 error_);
    error_ = _wrapper->clEnqueueAcquireGLObjects(
        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL,
        NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "Unable to acquire GL objects (%d)",
                 error_);

    size_t gws[1] = {c_numOfElements};
    error_ = _wrapper->clEnqueueNDRangeKernel(contextData_[i].clCmdQueue,
                                              contextData_[i].clKernel, 1, NULL,
                                              gws, NULL, 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed (%d)",
                 error_);

    error_ = _wrapper->clEnqueueReleaseGLObjects(contextData_[i].clCmdQueue, 1,
                                                 &(contextData_[i].inputBuffer),
                                                 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clEnqueueReleaseGLObjects failed (%d)", error_);
    error_ = _wrapper->clEnqueueReleaseGLObjects(
        contextData_[i].clCmdQueue, 1, &(contextData_[i].outputBuffer), 0, NULL,
        NULL);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clEnqueueReleaseGLObjects failed (%d)", error_);

    error_ = _wrapper->clFinish(contextData_[i].clCmdQueue);
    CHECK_RESULT((error_ != CL_SUCCESS), "clFinish() failed (%d)", error_);

    // Read the kernel output back through GL. glMapBuffer() returns NULL on
    // failure, so only copy (and unmap) when the mapping succeeded.
    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
    void* glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
    const bool mapFailed = (glMem == NULL);
    if (!mapFailed) {
      memcpy(inOutData, glMem, c_numOfElements * sizeof(cl_uint4));
      glUnmapBuffer(GL_ARRAY_BUFFER);
    }

    _wrapper->clReleaseMemObject(contextData_[i].inputBuffer);
    _wrapper->clReleaseMemObject(contextData_[i].outputBuffer);

    // Delete GL buffers
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glDeleteBuffers(1, &inGLBuffer);
    inGLBuffer = 0;
    glDeleteBuffers(1, &outGLBuffer);
    outGLBuffer = 0;

    // Reported only after this iteration's objects have been released.
    CHECK_RESULT(mapFailed, "glMapBuffer() failed");
  }

  // Compare expected output with actual data received
  for (unsigned int i = 0; i < c_numOfElements; i++) {
    for (unsigned int j = 0; j < sizeof(cl_uint4) / sizeof(cl_uint); j++) {
      CHECK_RESULT((inOutData[i].s[j] != expectedData[i].s[j]),
                   "Element %d is incorrect!\n\t \
expected:{%d, %d, %d, %d} differs from actual:{%d, %d, %d, %d}",
                   i, expectedData[i].s[0], expectedData[i].s[1],
                   expectedData[i].s[2], expectedData[i].s[3],
                   inOutData[i].s[0], inOutData[i].s[1], inOutData[i].s[2],
                   inOutData[i].s[3]);
    }
  }
}
unsigned int OCLGLMultiContext::close() {
for (unsigned int i = 0; i < c_glContextCount; i++) {
makeCurrent(contextData_[i].glContext);
_wrapper->clReleaseKernel(contextData_[i].clKernel);
_wrapper->clReleaseProgram(contextData_[i].clProgram);
_wrapper->clReleaseCommandQueue(contextData_[i].clCmdQueue);
_wrapper->clReleaseContext(contextData_[i].clContext);
destroyGLContext(contextData_[i].glContext);
}
return OCLGLCommon::close();
}
@@ -0,0 +1,54 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_MULTI_CONTEXT_H_
#define _OCL_GL_MULTI_CONTEXT_H_
#include "OCLGLCommon.h"
// CL/GL interop test that drives several GL contexts, each paired with its
// own CL context, command queue, program and kernel (see GLContextDataSet).
class OCLGLMultiContext : public OCLGLCommon {
public:
OCLGLMultiContext();
virtual ~OCLGLMultiContext();
// Test entry points.
// close() returns whatever OCLGLCommon::close() returns.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
virtual void run(void);
virtual unsigned int close(void);
private:
// Number of GL/CL context pairs created by open() and exercised by run().
static const unsigned int c_glContextCount = 3;
// Number of cl_uint4 elements in the data set passed through the contexts.
static const unsigned int c_numOfElements = 128;
// Everything tied to a single GL context: the GL context handle and the CL
// objects created for it.
struct GLContextDataSet {
OCLGLHandle glContext;
cl_context clContext;
cl_command_queue clCmdQueue;
cl_program clProgram;
cl_kernel clKernel;
cl_mem inputBuffer;
cl_mem outputBuffer;
};
GLContextDataSet contextData_[c_glContextCount];
// NOTE(review): not referenced by the member functions visible in the
// accompanying .cpp; confirm whether it is still needed.
bool failed_;
};
#endif // _OCL_GL_MULTI_CONTEXT_H_
@@ -0,0 +1,144 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLGLTexture.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// OpenCL C source of the unsigned-integer variant of "gltexture_test": each
// work-item copies one pixel from source to dest using read_imageui /
// write_imageui.
const static char* strKernelui =
"__kernel void gltexture_test(read_only image2d_t source, write_only "
"image2d_t dest) \n"
"{ "
" \n"
" int tidX = get_global_id(0); "
" \n"
" int tidY = get_global_id(1); "
" \n"
" uint4 pixel = read_imageui(source, (int2)(tidX, tidY)); "
" \n"
" write_imageui(dest, (int2)(tidX, tidY), pixel); "
" \n"
"}";
// OpenCL C source of the floating-point variant of "gltexture_test": each
// work-item copies one pixel from source to dest using read_imagef /
// write_imagef.
const static char* strKernelf =
"__kernel void gltexture_test(read_only image2d_t source, write_only "
"image2d_t dest) \n"
"{ "
" \n"
" int tidX = get_global_id(0); "
" \n"
" int tidY = get_global_id(1); "
" \n"
" float4 pixel = read_imagef(source, (int2)(tidX, tidY)); "
" \n"
" write_imagef(dest, (int2)(tidX, tidY), pixel); "
" \n"
"} "
" \n";
// Starts with no host buffers and no GL textures, and declares 8 sub-tests
// (4 texture formats, each with and without the render pass).
// currentTest_ and testRender_ are given defined values here because open()
// only assigns them after the base-class open succeeded, while run() reads
// currentTest_.
OCLGLTexture::OCLGLTexture()
    : currentTest_(0),
      inDataGL_(NULL),
      outDataGL_(NULL),
      inGLTexture_(0),
      outGLTexture_(0),
      testRender_(false) {
  _numSubTests = 4 * 2;
}
// Empty destructor: CL images, GL textures and host buffers are released in
// close().
OCLGLTexture::~OCLGLTexture() {}
// Prepares the test: seeds rand(), runs the common CL/GL setup, decodes the
// sub-test index and builds the matching "gltexture_test" kernel.
//
// test       - sub-test index; test % 4 selects the texture format and
//              test / 4 >= 1 enables the additional render pass
// units      - forwarded to OCLGLCommon::open()
// conversion - forwarded to OCLGLCommon::open()
// deviceId   - index into devices_ of the device the program is built for
//
// Failures are reported through CHECK_RESULT; nothing is built when the base
// open() already flagged an error.
void OCLGLTexture::open(unsigned int test, char* units, double& conversion,
                        unsigned int deviceId) {
  // Initialize random number seed
  srand((unsigned int)time(NULL));

  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;

  currentTest_ = test % 4;
  testRender_ = (test / 4) >= 1;

  // Build the kernel: format 0 is the unsigned-integer texture, all other
  // formats go through the float kernel.
  const char** kernelSource = (0 == currentTest_) ? &strKernelui : &strKernelf;
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, kernelSource,
                                                 NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateProgramWithSource() failed (%d)", error_);

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Query the log length first: requesting the log with a buffer that is
    // too small fails with CL_INVALID_VALUE and leaves the buffer untouched,
    // which previously meant printing an uninitialized fixed-size array.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog = (char*)malloc(logSize + 1);
    if (programLog != NULL) {
      // Zero-fill so the text is terminated even if the query fails.
      memset(programLog, 0, logSize + 1);
      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                      CL_PROGRAM_BUILD_LOG, logSize,
                                      programLog, NULL);
      printf("\n%s\n", programLog);
      free(programLog);
    }
    fflush(stdout);
  }
  // error_ still holds the clBuildProgram() result here.
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)", error_);

  kernel_ = _wrapper->clCreateKernel(program_, "gltexture_test", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)", error_);
}
void OCLGLTexture::run(void) {
bool retVal = false;
switch (currentTest_) {
case 0:
retVal = runTextureTest<unsigned int>(GL_RGBA32UI, GL_RGBA_INTEGER,
GL_UNSIGNED_INT);
break;
case 1:
retVal =
runTextureTest<unsigned char>(GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE);
break;
case 2:
retVal = runTextureTest<short>(GL_RGBA16, GL_RGBA, GL_SHORT);
break;
case 3:
retVal = runTextureTest<float>(GL_RGBA32F, GL_RGBA, GL_FLOAT);
break;
default:
CHECK_RESULT(true, "unsupported test number\n");
}
CHECK_RESULT((retVal != true), "cl-gl texture interop test failed ");
}
// Releases the CL images, GL textures and host buffers created by
// runTextureTest(), then returns the result of OCLGLCommon::close().
//
// runTextureTest() may return before it has pushed both images into buffers_
// (or may never have run), so only the entries that actually exist are
// released instead of indexing [0] and [1] unconditionally.
unsigned int OCLGLTexture::close(void) {
  for (size_t i = 0; i < buffers_.size(); i++) {
    clReleaseMemObject(buffers_[i]);
  }
  buffers_.clear();

  // Delete GL in & out buffers
  glFinish();
  glBindTexture(GL_TEXTURE_2D, 0);
  glDeleteTextures(1, &inGLTexture_);
  inGLTexture_ = 0;
  glDeleteTextures(1, &outGLTexture_);
  outGLTexture_ = 0;

  // free(NULL) is a no-op, so this is safe when nothing was allocated.
  free(inDataGL_);
  inDataGL_ = NULL;
  free(outDataGL_);
  outDataGL_ = NULL;

  return OCLGLCommon::close();
}
@@ -0,0 +1,214 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_GL_TEXTURE_H_
#define _OCL_GL_TEXTURE_H_
#include <iostream>
#include "OCLGLCommon.h"
// CL/GL interop test that copies a GL 2D texture to a second GL texture with
// an OpenCL kernel and verifies the result, for several texel formats.
class OCLGLTexture : public OCLGLCommon {
public:
// Texture dimensions in pixels and the number of components per pixel.
static const unsigned int c_imageWidth = 512;
static const unsigned int c_imageHeight = 512;
static const unsigned int c_elementsPerPixel = 4;
OCLGLTexture();
virtual ~OCLGLTexture();
// Test entry points.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
virtual void run(void);
virtual unsigned int close(void);
private:
// Format index selected in open() (test % 4).
unsigned int currentTest_;
// Host copies of the input and output texture data; allocated with malloc()
// in runTextureTest() and freed in close().
void* inDataGL_;
void* outDataGL_;
// GL texture names for the input and output textures.
GLuint inGLTexture_;
GLuint outGLTexture_;
// True when the sub-test also clears the input texture through a
// framebuffer before copying (sub-tests 4..7).
bool testRender_;
// Runs the copy test for component type T with the given GL internal format,
// pixel format and component type; returns true when the output matches.
template <typename T>
bool runTextureTest(GLint internalFormat, GLenum format, GLenum type);
};
// Copies a GL texture to another GL texture through the OpenCL kernel and
// checks the result.
//
// T              - host component type matching `type`
// internalFormat - GL internal format used for both textures
// format         - GL pixel format of the host data
// type           - GL component type of the host data
//
// Steps: fill the input with random values, create the two GL textures, wrap
// them as CL images, run the kernel between acquire/release, read the output
// texture back and compare. When testRender_ is set, the input texture is
// first cleared through a framebuffer and the loop runs twice, and the output
// is compared against a fixed per-type colour instead of the random input.
//
// Returns true when the output matches, false on any CL error or mismatch.
// Host buffers, textures and CL images created here are released by close().
template <typename T>
bool OCLGLTexture::runTextureTest(GLint internalFormat, GLenum format,
GLenum type) {
cl_mem image;
// NOTE(review): the malloc() results are used without a NULL check.
inDataGL_ =
malloc(c_imageWidth * c_imageHeight * c_elementsPerPixel * sizeof(T));
outDataGL_ =
malloc(c_imageWidth * c_imageHeight * c_elementsPerPixel * sizeof(T));
// Initialize input data with random values
T* inputIterator = (T*)inDataGL_;
for (unsigned int i = 0;
i < c_imageWidth * c_imageHeight * c_elementsPerPixel; i++) {
inputIterator[i] = (T)(rand() % 255);
}
// Initialize output data with zeros
memset(outDataGL_, 0,
c_imageWidth * c_imageHeight * c_elementsPerPixel * sizeof(T));
// Generate and Bind in & out OpenGL textures
glGenTextures(1, &inGLTexture_);
glGenTextures(1, &outGLTexture_);
glBindTexture(GL_TEXTURE_2D, inGLTexture_);
glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth,
(GLsizei)c_imageHeight, 0, format, type, inDataGL_);
glBindTexture(GL_TEXTURE_2D, outGLTexture_);
glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexImage2D(GL_TEXTURE_2D, 0, internalFormat, (GLsizei)c_imageWidth,
(GLsizei)c_imageHeight, 0, format, type, outDataGL_);
// Make sure GL has finished with the textures before CL wraps them.
glFinish();
// Create input buffer from GL input texture
image = _wrapper->clCreateFromGLTexture(
context_, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, inGLTexture_, &error_);
if (error_ != CL_SUCCESS) {
printf("Unable to create input buffer from GL texture (%d)", error_);
return false;
}
buffers_.push_back(image);
// Create output buffer from GL output texture
image = _wrapper->clCreateFromGLTexture(
context_, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, outGLTexture_, &error_);
if (error_ != CL_SUCCESS) {
printf("Unable to create output buffer from GL texture (%d)", error_);
return false;
}
buffers_.push_back(image);
// One work-item per pixel.
size_t gws[2] = {c_imageWidth, c_imageHeight};
// Assign args
for (unsigned int i = 0; i < buffers_.size(); i++) {
error_ =
_wrapper->clSetKernelArg(kernel_, i, sizeof(cl_mem), &buffers()[i]);
if (error_ != CL_SUCCESS) {
printf("clSetKernelArg() failed (%d)", error_);
return false;
}
}
// With the render pass enabled the copy is executed twice: once after the
// first clear and once after the second clear below.
int loop = (testRender_) ? 2 : 1;
for (int l = 0; l < loop; ++l) {
if (testRender_ && (l == 0)) {
// Attach the input texture to a framebuffer and clear it through GL.
// NOTE(review): this framebuffer is never unbound or deleted in this
// function, and its name is local, so it appears to be leaked.
GLuint FrameBufferName = 0;
glGenFramebuffers(1, &FrameBufferName);
glBindFramebuffer(GL_FRAMEBUFFER, FrameBufferName);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, inGLTexture_,
0);
glClearColor(.5f, 1.f, 1.0f, 0);
glClear(GL_COLOR_BUFFER_BIT);
glFinish();
}
// Acquire both images (input and output) for CL use.
error_ = _wrapper->clEnqueueAcquireGLObjects(cmdQueues_[_deviceId], 2,
&buffers()[0], 0, NULL, NULL);
if (error_ != CL_SUCCESS) {
printf("Unable to acquire GL objects (%d)", error_);
return false;
}
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
NULL, gws, NULL, 0, NULL, NULL);
if (error_ != CL_SUCCESS) {
printf("clEnqueueNDRangeKernel() failed (%d)", error_);
return false;
}
error_ = _wrapper->clEnqueueReleaseGLObjects(cmdQueues_[_deviceId], 2,
&buffers()[0], 0, NULL, NULL);
if (error_ != CL_SUCCESS) {
printf("clEnqueueReleaseGLObjects failed (%d)", error_);
return false;
}
error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
if (error_ != CL_SUCCESS) {
printf("clFinish() failed (%d)", error_);
return false;
}
if (testRender_ && (l == 0)) {
// Second clear, to all ones; the framebuffer bound above is still bound,
// so this again targets the input texture before the second copy.
glClearColor(1.f, 1.f, 1.f, 1.f);
glClear(GL_COLOR_BUFFER_BIT);
glFinish();
}
}
// Get the results from GL texture
glBindTexture(GL_TEXTURE_2D, outGLTexture_);
glActiveTexture(GL_TEXTURE0);
glGetTexImage(GL_TEXTURE_2D, 0, format, type, outDataGL_);
// Check output texture data
inputIterator = (T*)inDataGL_;
T* outputIterator = (T*)outDataGL_;
// Expected component value after the (1, 1, 1, 1) clear, per component type.
// NOTE(review): 0x3f800000 is the bit pattern of 1.0f; presumably that is
// what an integer texture receives from the float clear colour - confirm.
T color;
switch (type) {
case GL_UNSIGNED_INT:
color = (T)0x3f800000;
break;
case GL_UNSIGNED_BYTE:
color = (T)0xff;
break;
case GL_SHORT:
color = (T)0x7fff;
break;
case GL_FLOAT:
color = (T)1.f;
break;
default:
return false;
}
for (unsigned int i = 0;
i < c_imageWidth * c_imageHeight * c_elementsPerPixel; i++) {
if (testRender_) {
// Render pass: every component must equal the clear colour.
// NOTE(review): the message prints the random input as "expected" and the
// clear colour as "actual", although the comparison is output vs. colour.
if (outputIterator[i] != color) {
std::cout << "Element " << i
<< " in output texture is incorrect! (internal format = "
<< internalFormat << "\n\t expected:" << inputIterator[i]
<< " differs from actual clear color:" << color << std::endl;
return false;
}
} else if (inputIterator[i] != outputIterator[i]) {
// Plain copy: the output must be identical to the input.
std::cout << "Element " << i
<< " in output texture is incorrect! (internal format = "
<< internalFormat << "\n\t expected:" << inputIterator[i]
<< " differs from actual: " << outputIterator[i] << std::endl;
return false;
}
}
return true;
}
#endif // _OCL_GL_TEXTURE_H_
@@ -0,0 +1,54 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLTestListImp.h"
//
// Includes for tests
//
#include "OCLGLBuffer.h"
#include "OCLGLBufferMultipleQueues.h"
#include "OCLGLDepthBuffer.h"
#include "OCLGLDepthTex.h"
#include "OCLGLFenceSync.h"
#include "OCLGLMsaaTexture.h"
#include "OCLGLMultiContext.h"
#include "OCLGLTexture.h"
//
// Helper macro for adding tests
//
// Factory thunk: heap-allocates a default-constructed T and returns it as an
// untyped pointer. The caller owns the object.
template <typename T>
static void* dictionary_CreateTestFunc(void) {
  T* const instance = new T();
  return static_cast<void*>(instance);
}
// TEST(name) expands to one TestEntry initializer: the stringized class name
// paired with the factory function instantiated for that class.
#define TEST(name) \
{ #name, &dictionary_CreateTestFunc < name> }
// Tests exported by this module; the position in this array is the test index.
// NOTE(review): OCLGLDepthBuffer.h and OCLGLMsaaTexture.h are included above
// but neither class is registered here — confirm they are intentionally
// disabled.
TestEntry TestList[] = {
TEST(OCLGLBuffer), TEST(OCLGLBufferMultipleQueues),
TEST(OCLGLTexture), TEST(OCLGLMultiContext),
TEST(OCLGLFenceSync), TEST(OCLGLDepthTex),
};
// Number of entries in TestList, computed from the array itself.
unsigned int TestListCount = sizeof(TestList) / sizeof(TestList[0]);
// Version number and name reported for this test library.
unsigned int TestLibVersion = 0;
const char* TestLibName = "oclgl";
@@ -0,0 +1 @@
# all clear
@@ -0,0 +1,206 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _BaseTestImp_H_
#define _BaseTestImp_H_
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>
#include "OCLTest.h"
#include "OCLWrapper.h"
// Numeric status value 2.
// NOTE(review): presumably a process exit code for failures that were already
// reported — confirm against the callers.
#define EXIT_SILENT_FAILURE 2
// Stringizes its arguments, so kernel source can be written inline without
// quoting every line.
#define KERNEL(...) #__VA_ARGS__
// MSVC builds route snprintf to sprintf_s.
#ifdef _MSC_VER
#define snprintf sprintf_s
#endif
// CHECK_ERROR(error, msg): if `error` is not CL_SUCCESS, set _errorFlag, print
// msg with the error code, store msg in _errorMsg, bump _crcword and return
// from the enclosing function. Usable only inside functions returning void and
// where those members are in scope. The expansion is a bare `if` statement, so
// avoid using it as the unbraced body of an if/else.
#define CHECK_ERROR(error, msg) \
if (error != CL_SUCCESS) { \
_errorFlag = true; \
printf("\n\n%s\nError code: %d\n\n", msg, error); \
_errorMsg = msg; \
_crcword += 1; \
return; \
}
// Same bookkeeping as CHECK_ERROR, but execution continues after the check.
#define CHECK_ERROR_NO_RETURN(error, msg) \
if (error != CL_SUCCESS) { \
_errorFlag = true; \
printf("\n\n%s\nError code: %d\n\n", msg, error); \
_errorMsg = msg; \
_crcword += 1; \
}
// CHECK_RESULT(test, msg, ...): when `test` is TRUE the check has FAILED.
// Formats msg (printf-style, at most 4095 characters) into a temporary heap
// buffer, prints it with file and line, stores the formatted text in
// _errorMsg, sets _errorFlag, bumps _crcword and returns from the enclosing
// void function. Passing a constant 0 as `test` makes the macro a no-op.
// NOTE(review): the malloc result is not checked before use.
#define CHECK_RESULT(test, msg, ...) \
if ((test)) { \
char* buf = (char*)malloc(4096); \
_errorFlag = true; \
int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \
assert(rc >= 0 && rc < (int)4096); \
printf("%s:%d - %s\n", __FILE__, __LINE__, buf); \
_errorMsg = std::string(buf); \
_crcword += 1; \
free(buf); \
return; \
}
// Alias of CHECK_RESULT.
#define CHECK_RESULT_ARGS CHECK_RESULT
// CHECK_RESULT_NO_RETURN(test, msg, ...): when `test` is TRUE the check has
// FAILED. Formats msg (printf-style, at most 4095 characters), prints it with
// file and line, records the error state and lets execution continue.
// _errorMsg receives the formatted text (it used to receive the raw format
// string, leaving "%d"-style placeholders in the stored message), which
// matches what CHECK_RESULT stores.
#define CHECK_RESULT_NO_RETURN(test, msg, ...)        \
  if ((test)) {                                       \
    char* buf = (char*)malloc(4096);                  \
    _errorFlag = true;                                \
    int rc = snprintf(buf, 4096, msg, ##__VA_ARGS__); \
    assert(rc >= 0 && rc < (int)4096);                \
    printf("%s:%d - %s\n", __FILE__, __LINE__, buf);  \
    _errorMsg = std::string(buf);                     \
    _crcword += 1;                                    \
    free(buf);                                        \
  }
// Alias of CHECK_RESULT_NO_RETURN.
#define CHECK_RESULT_NO_RETURN_ARGS CHECK_RESULT_NO_RETURN
// CHECK_RESULT_SHUTDOWN(test, msg): when `test` is true, record the failure
// (flag, message, _crcword), call close() and return from the enclosing void
// function. msg is printed verbatim; no formatting arguments are accepted.
#define CHECK_RESULT_SHUTDOWN(test, msg) \
if ((test)) { \
_errorFlag = true; \
printf("%s\n", msg); \
_errorMsg = msg; \
_crcword += 1; \
close(); \
return; \
}
// CHECK_RESULT_CL(test, msg): like the above without close(), for functions
// that return a value — the enclosing function returns 1 on failure.
#define CHECK_RESULT_CL(test, msg) \
if ((test)) { \
_errorFlag = true; \
printf("%s\n", msg); \
_errorMsg = msg; \
_crcword += 1; \
return 1; \
}
// Base class for test implementations, derived from OCLTest. Declares the
// open/run/close life cycle, accessors for error state, device and platform
// selection and perf info, and holds the state the CHECK_* macros update
// (_errorFlag, _errorMsg, _crcword). run() is pure virtual.
class BaseTestImp : public OCLTest {
public:
BaseTestImp();
virtual ~BaseTestImp();
public:
virtual unsigned int getThreadUsage(void);
virtual int getNumSubTests(void);
//! Abstract functions being defined here
virtual void open();
virtual void open(unsigned int test, const char* deviceName,
unsigned int architecture);
// Both overloads below ignore units, conversion and deviceId and forward to
// the (test, deviceName, architecture) overload with a fixed device name.
// NOTE(review): the first one passes platformIndex in the `architecture`
// position — confirm this is intended.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId, unsigned int platformIndex) {
return open(test, "Tahiti", platformIndex);
}
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId) {
return open(test, "Tahiti", 0);
}
virtual void run(void) = 0;
virtual unsigned int close(void);
//! Functions to set class members
virtual void checkComplib(unsigned int test, const char* deviceName,
unsigned int architecture);
virtual void setDeviceName(const char*);
virtual const char* getDeviceName();
virtual void setErrorMsg(const char* error);
virtual const char* getErrorMsg(void);
virtual bool hasErrorOccured(void);
virtual void clearError();
// Downcast helpers: this class returns itself as BaseTestImp and NULL as
// OCLTestImp; the latter is virtual so a derived class can override it.
BaseTestImp* toBaseTestImp() { return this; }
virtual OCLTestImp* toOCLTestImp() { return NULL; }
// Sets the _cpu flag.
virtual void useCPU() { _cpu = true; }
virtual void setIterationCount(int cnt);
virtual void setDeviceId(unsigned int deviceId);
virtual unsigned int getDeviceId();
virtual void setPlatformIndex(unsigned int platformIndex);
virtual unsigned int getPlatformIndex();
virtual float getPerfInfo();
virtual void clearPerfInfo();
protected:
unsigned int _numSubTests;
unsigned int _openTest;
unsigned int _useThreads;
int _iterationCnt;
float _perfInfo;
bool _cpu;
// Incremented by the CHECK_* macros on every recorded failure.
unsigned int _crcword;
unsigned int _crctab[256];
// Error state written by the CHECK_* macros.
bool _errorFlag;
std::string _errorMsg;
const char* _deviceName;
unsigned int _architecture;
unsigned int _deviceId;
unsigned int _platformIndex;
bool failed_ = false;
// OpenCL state shared by derived tests.
cl_int error_;
cl_uint type_;
cl_uint deviceCount_;
cl_device_id* devices_;
cl_context context_;
cl_program program_;
cl_kernel kernel_;
};
// enum to keep track of different memory types
// NOTE(review): LOOCL looks like a mangled "LOCAL" (possibly from a bulk
// rename) — confirm before relying on the spelling; renaming it would affect
// every user of the enumerator.
enum MemType { LOOCL, REMOTE_CACHED, REMOTE_UNCACHED };
class DataType {
cl_image_format f;
const char* str;
unsigned int size;
public:
DataType() {}
DataType(cl_image_format f, const char* str, unsigned int size) {
this->f = f;
this->str = str;
this->size = size;
}
operator const char*() { return str; }
operator unsigned int() { return size; }
operator cl_image_format() { return f; }
};
// useful for initialization of an array of data types for a test:
// DTYPE(x, y) builds DataType(x, "x", y), using the spelling of x as the name.
// NOTE(review): y is cast without surrounding parentheses, so for a compound
// argument only its first operand is cast — pass simple values.
#define DTYPE(x, y) DataType(x, #x, (unsigned int)y)
#endif
@@ -0,0 +1,83 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLTestImp_H_
#define _OCLTestImp_H_
#include <string>
#include <vector>
#include "BaseTestImp.h"
#include "CL/cl.h"
#include "OCL/Thread.h"
#include "OCLTest.h"
#include "OCLWrapper.h"
// Test base class built on BaseTestImp: adds the OCLWrapper, a platform
// handle, a vector of command queues, a vector of cl_mem buffers, random and
// CRC helpers, and two static locks.
class OCLTestImp : public BaseTestImp {
public:
OCLTestImp();
virtual ~OCLTestImp();
public:
//! Abstract functions being defined here
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId, unsigned int platformIndex);
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceId);
virtual void run(void) = 0;
virtual unsigned int close(void);
//! Functions to set class members
public:
void useCPU();
int genIntRand(int a, int b);
int genBitRand(int n);
void accumulateCRC(const void* buffer, int len);
void setOCLWrapper(OCLWrapper* wrapper);
// Overrides the base-class helper to expose this object as an OCLTestImp.
OCLTestImp* toOCLTestImp() { return this; }
static OCLutil::Lock openDeviceLock;
static OCLutil::Lock compileLock;
protected:
// Read-only view of buffers_.
const std::vector<cl_mem>& buffers() const { return buffers_; }
OCLWrapper* _wrapper;
int _seed;
// Common data of any CL program
// NOTE(review): error_, type_, deviceCount_, devices_, context_, program_ and
// kernel_ are also declared in BaseTestImp. The declarations below hide the
// base members, so code in BaseTestImp and code in this class operate on
// different variables — confirm this is intended.
cl_int error_;
cl_uint type_;
cl_uint deviceCount_;
cl_device_id* devices_;
cl_platform_id platform_;
std::vector<cl_command_queue> cmdQueues_;
cl_context context_;
cl_program program_;
cl_kernel kernel_;
std::vector<cl_mem> buffers_;
};
// useful for initialization of an array of data types for a test
// NOTE(review): BaseTestImp.h (included above) already defines DTYPE with the
// same replacement text; the two definitions must stay token-identical or the
// compiler will report a macro redefinition.
#define DTYPE(x, y) DataType(x, #x, (unsigned int)y)
#endif
@@ -0,0 +1,86 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef __Dictionary_h__
#define __Dictionary_h__
//
// Testing module (plugin) interface forward declarations
//
// Export and calling-convention decorations for the module entry points.
#ifdef ATI_OS_WIN
#define OCL_DLLEXPORT __declspec(dllexport)
#define OCL_CALLCONV __cdecl
#endif
#ifdef ATI_OS_LINUX
#define OCL_DLLEXPORT
#define OCL_CALLCONV
#endif
// Fallback: when neither platform macro is defined, expand to nothing so the
// prototypes below still compile instead of failing on an unknown identifier.
#ifndef OCL_DLLEXPORT
#define OCL_DLLEXPORT
#endif
#ifndef OCL_CALLCONV
#define OCL_CALLCONV
#endif
class OCLTest;
//
// OCLTestList_TestCount - retrieve the number of tests in the testing module
//
extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV OCLTestList_TestCount(void);
//
// OCLTestList_TestLibVersion - retrieve the version of test lib in the testing
// module
//
extern "C" OCL_DLLEXPORT unsigned int OCL_CALLCONV
OCLTestList_TestLibVersion(void);
//
// OCLTestList_TestLibName - retrieve the name of test library
//
extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV OCLTestList_TestLibName(void);
//
// OCLTestList_TestName - retrieve the name of the indexed test in the module
//
extern "C" OCL_DLLEXPORT const char* OCL_CALLCONV
OCLTestList_TestName(unsigned int testNum);
//
// OCLTestList_CreateTest - create a test by index
//
extern "C" OCL_DLLEXPORT OCLTest* OCL_CALLCONV
OCLTestList_CreateTest(unsigned int testNum);
//
// OCLTestList_DestroyTest - destroy a test object
//
extern "C" OCL_DLLEXPORT void OCL_CALLCONV
OCLTestList_DestroyTest(OCLTest* test);
//
// internal global data that is populated in each dll
//
// One registry row: the printable test name and a factory that returns a new
// test object as an untyped pointer.
typedef struct _TestEntry {
const char* name;
void* (*create)(void);
} TestEntry;
// The test table, its entry count, and the version and name of the test
// library; declared here and defined by each test module.
extern TestEntry TestList[];
extern unsigned int TestListCount;
extern unsigned int TestLibVersion;
extern const char* TestLibName;
#endif
@@ -0,0 +1,32 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_INCLUDES_H
#define _OCL_INCLUDES_H
#ifdef ATI_OS_WIN
#define POINTER_64 __ptr64
#include <windows.h>
#include "d3d9.h"
#endif
#include "CL/cl.h"
#endif //_OCL_INCLUDES_H
@@ -0,0 +1,211 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerf3DImageWriteSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings
// NOTE(review): this tests WIN_OS, while other headers in this drop test
// ATI_OS_WIN or _MSC_VER — confirm WIN_OS is actually defined on Windows
// builds.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Stringizes its arguments so the kernel can be written inline.
#define KERNEL_CODE(...) #__VA_ARGS__
// Edge lengths of the cubic 3D images under test; one sub-test per size and
// format combination.
#define NUM_SIZES 4
static const unsigned int Sizes[NUM_SIZES] = {64, 128, 256, 512};
// Image formats under test and their printable names (parallel arrays).
#define NUM_FORMATS 1
static const cl_image_format formats[NUM_FORMATS] = {
{CL_RGBA, CL_UNSIGNED_INT8}};
static const char *textFormats[NUM_FORMATS] = {"CL_RGBA , CL_UNSIGNED_INT8"};
static const unsigned int formatSize[NUM_FORMATS] = {sizeof(CL_UNSIGNED_INT8)};
const static char *strKernel = {KERNEL_CODE(
\n __kernel void image_kernel(write_only image3d_t input) {
size_t x = get_global_id(0);
size_t y = get_global_id(1);
size_t z = get_global_id(2);
int4 coords = (int4)(x, y, z, 0);
write_imageui(input, coords, (1, 1, 1, 1));
}
\n)};
// One sub-test per (image size, image format) combination.
OCLPerf3DImageWriteSpeed::OCLPerf3DImageWriteSpeed() {
  const unsigned int combinations = NUM_FORMATS * NUM_SIZES;
  _numSubTests = combinations;
}

// Nothing to do here; the image is released in close().
OCLPerf3DImageWriteSpeed::~OCLPerf3DImageWriteSpeed() {
}
// No-op function with the signature of an OpenCL context error callback; all
// arguments are ignored.
// NOTE(review): nothing in the visible part of this file references it.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/, void * /*user_data*/) {
}
// Prepares sub-test `test`: opens the base test, skips when the device lacks
// cl_khr_3d_image_writes, otherwise builds the kernel, creates a cubic
// write-only 3D image of the selected size and format, and binds it as kernel
// argument 0.
//
// test       - sub-test index; selects the size (test % NUM_SIZES) and the
//              format ((test / NUM_SIZES) % NUM_FORMATS).
// units, conversion, deviceId - forwarded to OCLTestImp::open().
//
// On any failure the CHECK_RESULT macro records the error and returns early.
void OCLPerf3DImageWriteSpeed::open(unsigned int test, char *units,
                                    double &conversion, unsigned int deviceId) {
  error_ = CL_SUCCESS;
  testId_ = test;
  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");

  program_ = 0;
  kernel_ = 0;
  cmd_queue_ = 0;
  imageBuffer_ = 0;
  skip_ = false;

  // Ask for the length of the extension string first. A fixed 1024-byte buffer
  // makes clGetDeviceInfo fail with CL_INVALID_VALUE on devices that report a
  // longer list, which would fail the test for an unrelated reason.
  size_t extSize = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
                                     0, NULL, &extSize);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  // One spare byte keeps the text terminated whatever the driver writes.
  std::vector<char> extensions(extSize + 1, '\0');
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_EXTENSIONS,
                                     extSize, extensions.data(), NULL);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  if (!strstr(extensions.data(), "cl_khr_3d_image_writes")) {
    skip_ = true;
    testDescString = "3D Write not supported. Test Skipped.";
    return;
  }

  bufSize_ = Sizes[test % NUM_SIZES];
  bufnum_ = (test / NUM_SIZES) % NUM_FORMATS;
  memSize_ = bufSize_ * bufSize_ * bufSize_ * formatSize[bufnum_];
  cmd_queue_ = cmdQueues_[_deviceId];

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  // NOTE(review): the device is indexed with the deviceId parameter here and
  // with the _deviceId member above — presumably equal after
  // OCLTestImp::open(); confirm.
  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
                                    "-cl-std=CL2.0", NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Size the log before fetching it. A too-small fixed buffer made the
    // query fail and left uninitialised bytes to be printed.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    std::vector<char> programLog(logSize + 1, '\0');
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, logSize,
                                    programLog.data(), NULL);
    printf("\n%s\n", programLog.data());
    fflush(stdout);
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "image_kernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  imageBuffer_ = _wrapper->clCreateImage3D(
      context_, CL_MEM_WRITE_ONLY, &formats[bufnum_], bufSize_, bufSize_,
      bufSize_, 0, 0, NULL, &error_);
  CHECK_RESULT(imageBuffer_ == 0, "clCreateImage(imageBuffer_) failed");

  // set kernel arguments
  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &imageBuffer_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
}
// Runs the sub-test prepared by open(): one warm-up dispatch, a read-back that
// checks every byte of the image equals 1, then numIter timed dispatches. The
// resulting write rate in GB/s goes to _perfInfo and a summary line goes to
// testDescString. Does nothing when the test was skipped.
//
// Fixes relative to the earlier version:
//  - the read-back buffer is a std::vector, so it is released on every exit
//    path (it used to be malloc'ed, freed with `delete`, and leaked on the
//    early returns);
//  - a validation mismatch now records a failure; the earlier
//    CHECK_RESULT_NO_RETURN(0, ...) had a constant-false condition and never
//    set the error flag.
void OCLPerf3DImageWriteSpeed::run(void) {
  if (skip_) {
    return;
  }
  CPerfCounter timer;
  unsigned int fmt_num = (testId_ / NUM_SIZES) % NUM_FORMATS;
  size_t gws[3] = {bufSize_, bufSize_, bufSize_};
  size_t lws[3] = {8, 8, 4};

  // warm up
  error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws,
                                            lws, 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
  _wrapper->clFinish(cmd_queue_);

  // checkData -- scoped so the host copy is gone before timing starts
  {
    std::vector<char> hostImage(memSize_);
    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {bufSize_, bufSize_, bufSize_};
    size_t image_row_pitch = bufSize_ * formatSize[bufnum_];
    size_t image_slice_pitch = image_row_pitch * bufSize_;
    // NOTE(review): this call bypasses _wrapper, unlike every other CL call
    // in this file -- confirm whether the wrapper exposes clEnqueueReadImage.
    error_ = clEnqueueReadImage(cmd_queue_, imageBuffer_, true, origin, region,
                                image_row_pitch, image_slice_pitch,
                                hostImage.data(), 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage() failed");

    // Bounded by the buffer's own size, so the scan can never run past it.
    for (size_t i = 0; i < hostImage.size(); ++i) {
      if (hostImage[i] != 1) {
        printf(
            "(%4dx%4dx%4d) fmt:%s(%1u) checkData() fail, image_ptr[%u] = %d\n",
            bufSize_, bufSize_, bufSize_, textFormats[fmt_num],
            formatSize[bufnum_], (unsigned int)i, (int)hostImage[i]);
        CHECK_RESULT_NO_RETURN(true, "Data validation failed!\n");
        char buf[256];
        SNPRINTF(buf, sizeof(buf),
                 " (%4dx%4dx%4d) fmt:%s(%1d) checkData() FAILED! ", bufSize_,
                 bufSize_, bufSize_, textFormats[fmt_num], formatSize[bufnum_]);
        testDescString = buf;
        return;
      }
    }
  }

  // test begins
  unsigned int numIter = 5;
  timer.Reset();
  timer.Start();
  for (unsigned int i = 0; i < numIter; ++i) {
    error_ = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, kernel_, 3, NULL, gws,
                                              lws, 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
    _wrapper->clFinish(cmd_queue_);
  }
  timer.Stop();
  double sec = timer.GetElapsedTime();

  // write_image speed in GB/s
  double perf = ((double)memSize_ * numIter * (double)(1e-09)) / sec;
  _perfInfo = (float)perf;
  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%3dx%3dx%3d) fmt:%s(%1u) i: %2d (GB/s) ",
           bufSize_, bufSize_, bufSize_, textFormats[fmt_num],
           formatSize[bufnum_], numIter);
  testDescString = buf;
}
// Releases the 3D image (if the test ran and one was created), then delegates
// to the base-class close() and returns its result. A failed release is
// recorded but does not stop the shutdown.
unsigned int OCLPerf3DImageWriteSpeed::close(void) {
  const bool haveImage = !skip_ && imageBuffer_;
  if (haveImage) {
    error_ = _wrapper->clReleaseMemObject(imageBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(imageBuffer_) failed");
  }
  return OCLTestImp::close();
}
@@ -0,0 +1,49 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_3DImageWriteSpeed_H_
#define _OCL_3DImageWriteSpeed_H_
#include "OCLTestImp.h"
// Performance test that measures how fast a kernel can write a cubic 3D
// image. open() builds the kernel and the image, run() validates and times
// the writes, close() releases the image.
class OCLPerf3DImageWriteSpeed : public OCLTestImp {
public:
OCLPerf3DImageWriteSpeed();
virtual ~OCLPerf3DImageWriteSpeed();
public:
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
virtual void run(void);
virtual unsigned int close(void);
// Queue used for all work; taken from cmdQueues_ in open().
cl_command_queue cmd_queue_;
// The write-only 3D image the kernel fills.
cl_mem imageBuffer_;
// Edge length of the image (width = height = depth).
unsigned int bufSize_;
// Index into the format tables for the current sub-test.
unsigned int bufnum_;
// NOTE(review): not referenced by the implementation visible in this drop.
char* memptr;
// Image size in bytes: bufSize_ cubed times the per-pixel size.
unsigned int memSize_;
// Sub-test index passed to open().
unsigned int testId_;
// True when the device lacks 3D image write support; run() then returns
// immediately and close() skips the image release.
bool skip_;
};
#endif // _OCL_3DImageWriteSpeed_H_
@@ -0,0 +1,451 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfAES256.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
#include "Timer.h"
// OpenCL source used when sub-test 0 is opened. CryptThread reads the lookup
// tables directly through a __global pointer and applies the round keys from
// pKey[iRounds] down to pKey[0].
// NOTE(review): the embedded header says "DO NOT REDISTRIBUTE", which appears
// to conflict with the permissive license at the top of this file — confirm
// the provenance of this kernel.
static const char *aes256_kernel =
"// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT "
"REDISTRIBUTE!!\n"
"inline uint Load(__global uint* pData, const uint iX, const uint iY)\n"
"{\n"
" return pData[iX | (iY << 8)];\n"
"}\n"
"\n"
"\n"
"inline uint4 Load4(__global uint* pData, const uint4 uX, const uint iY)\n"
"{\n"
" uint uExtent = iY << 8;\n"
" uint4 uNdx = uX + uExtent;\n"
" \n"
" return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], "
"pData[uNdx.w]);\n"
"}\n"
"\n"
"\n"
"__kernel \n"
"__attribute__((vec_type_hint(uint4))) \n"
"void CryptThread(__global uint4* pInput, __global uint4* pOutput,\n"
" __global uint* pTables,\n"
" __global uint4* pKey, const uint iRounds)\n"
"{\n"
" const uint iNdx = get_global_id(0);\n"
" \n"
" uint4 state, istate, tstate;\n"
" state = pInput[iNdx] ^ pKey[iRounds];\n"
" \n"
" for (uint i = iRounds-1; i; i--)\n"
" {\n"
" istate = state & 0xFF;\n"
" tstate = Load4(pTables, istate.xyzw, 0);\n"
"\n"
" istate = (state >> 8) & 0xFF;\n"
" tstate^= Load4(pTables, istate.wxyz, 1);\n"
"\n"
" istate = (state >> 16) & 0xFF;\n"
" tstate^= Load4(pTables, istate.zwxy, 2);\n"
"\n"
" istate = state >> 24;\n"
" tstate^= Load4(pTables, istate.yzwx, 3);\n"
"\n"
" state = tstate ^ pKey[i];\n"
" }\n"
"\n"
" istate = state & 0xFF;\n"
" tstate = Load4(pTables, istate.xyzw, 4);\n"
"\n"
" istate = (state >> 8) & 0xFF;\n"
" tstate |= Load4(pTables, istate.wxyz, 4) << 8;\n"
"\n"
" istate = (state >> 16) & 0xFF;\n"
" tstate |= Load4(pTables, istate.zwxy, 4) << 16;\n"
"\n"
" istate = state >> 24;\n"
" tstate |= Load4(pTables, istate.yzwx, 4) << 24;\n"
"\n"
" pOutput[iNdx] = tstate ^ pKey[0];\n"
"}\n";
// OpenCL source used for every sub-test other than 0. It differs from
// aes256_kernel in three ways visible in the text: tables and keys arrive as
// __constant pointers; because _IS_GPU_ is defined unconditionally inside the
// source, the tables are first copied into a __local array by the work-items
// with a local id below AES_TABLE_SIZE; and the round keys are applied from
// pKey[0] up to pKey[iRounds] with the opposite swizzle rotation.
// NOTE(review): the per-work-item copy span is AES_CONST_SIZE / iLSize with
// integer division, so a work-group size that does not divide 1280 would
// leave the tail of the table uncopied — confirm the launch size.
// NOTE(review): the same "DO NOT REDISTRIBUTE" header as in aes256_kernel
// applies here — confirm provenance.
static const char *aes256_kernel2 =
"// NOTE: THIS KERNEL WAS ADOPTED FROM SISOFT SANDRA: DO NOT "
"REDISTRIBUTE!!\n"
"#define AES_BLOCK_SIZE 16\n"
"#define AES_TABLE_SIZE 256\n"
"\n"
"#define AES_TABLE_MAX 5\n"
"#define AES_CONST_SIZE (AES_TABLE_SIZE*AES_TABLE_MAX)\n"
"\n"
"#define AES_ROUND_128 10\n"
"#define AES_ROUND_192 12\n"
"#define AES_ROUND_256 14\n"
"#define AES_ROUNDKEY_MAX (AES_BLOCK_SIZE/4*(AES_ROUND_256+1))\n"
"#define _IS_GPU_\n"
"\n"
"\n"
"inline uint Load(\n"
"#ifdef _IS_GPU_\n"
" __local uint* pData,\n"
"#else\n"
" __constant uint* pData,\n"
"#endif\n"
" const uint iX, const uint iY)\n"
"{\n"
" const uint uNdx = iX + iY*AES_TABLE_SIZE;\n"
" return pData[uNdx];\n"
"}\n"
"\n"
"\n"
"inline uint4 Load4(\n"
"#ifdef _IS_GPU_\n"
" __local uint* pData,\n"
"#else\n"
" __constant uint* pData,\n"
"#endif\n"
" const uint4 uX, const uint iY)\n"
"{\n"
" const uint uExtent = iY*AES_TABLE_SIZE;\n"
" const uint4 uNdx = uX + uExtent;\n"
" \n"
" return (uint4)(pData[uNdx.x], pData[uNdx.y], pData[uNdx.z], "
"pData[uNdx.w]);\n"
"}\n"
"\n"
"\n"
"__kernel \n"
"__attribute__((vec_type_hint(uint4)))\n"
"#ifdef KERNEL_MAX_THREADS\n"
"__attribute__((work_group_size_hint(KERNEL_MAX_THREADS, 1, 1)))\n"
"#endif\n"
"void CryptThread(__global const uint4* pInput, __global uint4* pOutput,\n"
" __constant uint* pTables,\n"
" __constant uint4* pKey, const uint iRounds)\n"
"{\n"
" const size_t iNdx = get_global_id(0);\n"
"\n"
"#ifdef _IS_GPU_\n"
" #define Load4T(x, y) Load4(ulTables, x, y)\n"
"\n"
" __local uint ulTables[AES_CONST_SIZE];\n"
"\n"
" const uint iLdx = get_local_id(0);\n"
" if (iLdx < AES_TABLE_SIZE) {\n"
" const uint iGrps = get_local_size(0);\n"
" const uint iLSize = min(iGrps, (uint)AES_TABLE_SIZE);\n"
" const uint iBpL = AES_CONST_SIZE/iLSize;\n"
"\n"
" const uint iStart = iLdx*iBpL;\n"
" const uint iEnd = iStart + iBpL;\n"
"\n"
" for (uint i=iStart; i<iEnd; i++) {\n"
" ulTables[i] = pTables[i];\n"
" }\n"
" }\n"
"\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"#else\n"
" #define Load4T(x, y) Load4(pTables, x, y)\n"
"#endif\n"
" \n"
" uint4 state, istate, tstate;\n"
" state = pInput[iNdx] ^ pKey[0];\n"
" \n"
" for (uint i = 1; i < iRounds; i++)\n"
" {\n"
" istate = state & 0xFF;\n"
" tstate = Load4T(istate.xyzw, 0);\n"
"\n"
" istate = (state >> 8) & 0xFF;\n"
" tstate^= Load4T(istate.yzwx, 1);\n"
"\n"
" istate = (state >> 16) & 0xFF;\n"
" tstate^= Load4T(istate.zwxy, 2);\n"
"\n"
" istate = state >> 24;\n"
" tstate^= Load4T(istate.wxyz, 3);\n"
"\n"
" state = tstate ^ pKey[i];\n"
" }\n"
"\n"
" istate = state & 0xFF;\n"
" tstate = Load4T(istate.xyzw, 4);\n"
"\n"
" istate = (state >> 8) & 0xFF;\n"
" tstate |= Load4T(istate.yzwx, 4) << 8;\n"
"\n"
" istate = (state >> 16) & 0xFF;\n"
" tstate |= Load4T(istate.zwxy, 4) << 16;\n"
"\n"
" istate = state >> 24;\n"
" tstate |= Load4T(istate.wxyz, 4) << 24;\n"
"\n"
" pOutput[iNdx] = tstate ^ pKey[iRounds];\n"
"}\n";
// Two sub-tests: index 0 builds aes256_kernel, any other index builds
// aes256_kernel2.
OCLPerfAES256::OCLPerfAES256() {
  const unsigned int kernelVariants = 2;
  _numSubTests = kernelVariants;
}

// No resources are released in the destructor.
OCLPerfAES256::~OCLPerfAES256() {
}
// Fills the first bufSize_ bytes of `buffer` with the 32-bit value `val` by
// mapping it for writing on cmd_queue_, then unmaps and waits for the queue.
//
// A failed map is now reported and aborts the fill; the pointer used to be
// written through unconditionally, so a failed map dereferenced NULL. A failed
// unmap is recorded as well instead of being silently dropped.
void OCLPerfAES256::setData(cl_mem buffer, unsigned int val) {
  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
      cmd_queue_, buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
      &error_);
  CHECK_RESULT((error_ != CL_SUCCESS) || (data == NULL),
               "clEnqueueMapBuffer failed");

  const size_t wordCount = bufSize_ / sizeof(unsigned int);
  for (size_t i = 0; i < wordCount; i++) {
    data[i] = val;
  }

  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
                                             NULL);
  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                         "clEnqueueUnmapMemObject failed");
  _wrapper->clFinish(cmd_queue_);
}
// Maps `buffer` for reading on cmd_queue_, unmaps it and waits for the queue.
//
// NOTE(review): no element is inspected. The earlier version looped over the
// mapped words with an empty body, which has been dropped as dead code; real
// validation of the output is still missing.
//
// Map and unmap failures are now recorded instead of being ignored; a failed
// map returns before the unmap call would be handed an invalid pointer.
void OCLPerfAES256::checkData(cl_mem buffer) {
  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
      cmd_queue_, buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
      &error_);
  CHECK_RESULT((error_ != CL_SUCCESS) || (data == NULL),
               "clEnqueueMapBuffer failed");

  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, buffer, data, 0, NULL,
                                             NULL);
  CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                         "clEnqueueUnmapMemObject failed");
  _wrapper->clFinish(cmd_queue_);
}
// Context error callback passed to clCreateContext in open(); deliberately
// ignores every argument.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/, void * /*user_data*/) {
}
void OCLPerfAES256::open(unsigned int test, char *units, double &conversion,
unsigned int deviceId) {
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_uint num_devices = 0;
cl_device_id *devices = NULL;
cl_device_id device = NULL;
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
context_ = 0;
cmd_queue_ = 0;
program_ = 0;
kernel_ = 0;
inBuffer_ = 0;
outBuffer_ = 0;
tableBuffer_ = 0;
keyBuffer_ = 0;
blockSize_ = 1024;
maxIterations = 50;
bufSize_ = 5592320 * sizeof(cl_uint4);
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
platform = platforms[_platformIndex];
char pbuf[100];
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
NULL);
num_devices = 0;
/* Get the number of requested devices */
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
&num_devices);
delete platforms;
}
/*
* If we could find our platform, use it. If not, die as we need the AMD
* platform for these extensions.
*/
CHECK_RESULT(platform == 0,
"Couldn't find platform with GPU devices, cannot proceed");
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
CHECK_RESULT(devices == 0, "no devices");
/* Get the requested device */
error_ =
_wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
device = devices[_deviceId];
context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
&error_);
CHECK_RESULT(context_ == 0, "clCreateContext failed");
char charbuf[1024];
size_t retsize;
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
charbuf, &retsize);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
// Increase iterations for devices with many CUs
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(size_t), &numCUs, &retsize);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
maxIterations *= (unsigned int)(1 + 10 * numCUs / 20);
cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
inBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, bufSize_,
NULL, &error_);
CHECK_RESULT(inBuffer_ == 0, "clCreateBuffer(inBuffer) failed");
outBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize_,
NULL, &error_);
CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
tableBuffer_ =
_wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 5120, NULL, &error_);
CHECK_RESULT(tableBuffer_ == 0, "clCreateBuffer(tableBuffer) failed");
keyBuffer_ =
_wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, 240, NULL, &error_);
CHECK_RESULT(keyBuffer_ == 0, "clCreateBuffer(keyBuffer) failed");
if (_openTest == 0) {
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&aes256_kernel, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
testDescString += "orig";
} else {
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&aes256_kernel2, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
testDescString += " new";
}
const char *buildOps = NULL;
error_ = _wrapper->clBuildProgram(program_, 1, &device, buildOps, NULL, NULL);
if (error_ != CL_SUCCESS) {
cl_int intError;
char log[16384];
intError =
_wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
16384 * sizeof(char), log, NULL);
printf("Build error -> %s\n", log);
CHECK_RESULT(0, "clBuildProgram failed");
}
kernel_ = _wrapper->clCreateKernel(program_, "CryptThread", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
cl_uint rounds = 14;
error_ =
_wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&inBuffer_);
error_ =
_wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_mem), (void *)&outBuffer_);
error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_mem),
(void *)&tableBuffer_);
error_ =
_wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&keyBuffer_);
error_ =
_wrapper->clSetKernelArg(kernel_, 4, sizeof(cl_uint), (void *)&rounds);
setData(inBuffer_, 0xdeadbeef);
setData(outBuffer_, 0xdeadbeef);
}
void OCLPerfAES256::run(void) {
int global = bufSize_ / sizeof(cl_uint4);
int local = 64;
size_t global_work_size[1] = {(size_t)global};
size_t local_work_size[1] = {(size_t)local};
CPerfCounter timer;
timer.Reset();
timer.Start();
for (unsigned int i = 0; i < maxIterations; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, NULL);
}
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
_wrapper->clFinish(cmd_queue_);
timer.Stop();
double sec = timer.GetElapsedTime();
// No idea what data should be in here
// checkData(outBuffer_);
// Compute GB/s
double perf =
((double)bufSize_ * (double)maxIterations * (double)(1e-09)) / sec;
_perfInfo = (float)perf;
}
// Releases every OpenCL object created by open() and returns _crcword.
// Safe to call after a partially failed open(): each handle is released only
// if it is non-null, and is reset to 0 afterwards so a repeated close() does
// not release it twice.
unsigned int OCLPerfAES256::close(void) {
  // The queue does not exist if open() bailed out before creating it.
  if (cmd_queue_) {
    _wrapper->clFinish(cmd_queue_);
  }
  if (inBuffer_) {
    error_ = _wrapper->clReleaseMemObject(inBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(inBuffer_) failed");
    inBuffer_ = 0;
  }
  if (outBuffer_) {
    error_ = _wrapper->clReleaseMemObject(outBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(outBuffer_) failed");
    outBuffer_ = 0;
  }
  if (tableBuffer_) {
    error_ = _wrapper->clReleaseMemObject(tableBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(tableBuffer_) failed");
    tableBuffer_ = 0;
  }
  if (keyBuffer_) {
    error_ = _wrapper->clReleaseMemObject(keyBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(keyBuffer_) failed");
    keyBuffer_ = 0;
  }
  if (kernel_) {
    error_ = _wrapper->clReleaseKernel(kernel_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
    kernel_ = 0;
  }
  if (program_) {
    error_ = _wrapper->clReleaseProgram(program_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
    program_ = 0;
  }
  if (cmd_queue_) {
    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseCommandQueue failed");
    cmd_queue_ = 0;
  }
  if (context_) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
    context_ = 0;
  }
  return _crcword;
}
@@ -0,0 +1,58 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_AES256_H_
#define _OCL_AES256_H_
#include "OCLTestImp.h"
// Performance test that repeatedly launches the "CryptThread" AES-256 kernel
// and reports throughput in GB/s of buffer data processed.
// Life cycle: open() creates the OpenCL objects, run() times the launches,
// close() releases everything.
class OCLPerfAES256 : public OCLTestImp {
public:
OCLPerfAES256();
virtual ~OCLPerfAES256();
public:
// Creates context, queue, buffers, program and kernel for sub-test `test`
// on device `deviceID`; sets `conversion` to 1.0.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Enqueues the kernel maxIterations times and stores GB/s in _perfInfo.
virtual void run(void);
// Releases all OpenCL objects and returns _crcword.
virtual unsigned int close(void);
std::string shader_;
// Called by open() with 0xdeadbeef for the input and output buffers.
void setData(cl_mem buffer, unsigned int data);
// Currently not called from run() (the call there is commented out).
void checkData(cl_mem buffer);
// OpenCL objects owned by the test; created in open(), released in close().
cl_context context_;
cl_command_queue cmd_queue_;
cl_program program_;
cl_kernel kernel_;
cl_mem inBuffer_;     // kernel argument 0
cl_mem outBuffer_;    // kernel argument 1
cl_mem tableBuffer_;  // kernel argument 2 (5120 bytes)
cl_mem keyBuffer_;    // kernel argument 3 (240 bytes)
cl_int error_;        // last OpenCL status code
unsigned int width_;
unsigned int bufSize_;       // size in bytes of inBuffer_/outBuffer_
unsigned int blockSize_;     // set to 1024 in open()
unsigned int maxIterations;  // kernel launches per run(); scaled by numCUs
// Filled from CL_DEVICE_MAX_COMPUTE_UNITS in open().
// NOTE(review): that query returns a cl_uint but is read into a size_t here -
// confirm the upper bytes are not left uninitialised.
size_t numCUs;
};
#endif // _OCL_AES256_H_
@@ -0,0 +1,817 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfAtomicSpeed.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "CL/cl.h"
#include "OCLPerfAtomicSpeedKernels.h"
#include "Timer.h"
// Quiet pesky warnings
// SNPRINTF(buffer, size, format, ...) maps to sprintf_s when WIN_OS is
// defined and to snprintf otherwise; both are called with the same
// (buffer, size, format, ...) argument order in this file.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Define the test suite tests.
// Each entry is {atomicType, inputScale}: the kernel family to run and the
// multiplier applied to the base input size in setupHistogram(). The
// sub-test number passed to open() indexes this table, and the constructor
// derives _numSubTests from its length.
testOCLPerfAtomicSpeedStruct testOCLPerfAtomicSpeedList[] = {
{LocalHistogram, 1},
{LocalHistogram, 2},
{LocalHistogram, 4},
{GlobalHistogram, 1},
{GlobalHistogram, 2},
{GlobalHistogram, 4},
{Global4Histogram, 1},
{Global4Histogram, 2},
{Global4Histogram, 4},
{LocalReductionNoAtomics, 1},
{LocalReductionNoAtomics, 2},
{LocalReductionNoAtomics, 4},
{LocalReductionAtomics, 1},
{LocalReductionAtomics, 2},
{LocalReductionAtomics, 4},
{Local4ReductionNoAtomics, 1},
{Local4ReductionNoAtomics, 2},
{Local4ReductionNoAtomics, 4},
// Local4ReductionAtomics is currently disabled.
/* {Local4ReductionAtomics, 1},
{Local4ReductionAtomics, 2},
{Local4ReductionAtomics, 4},*/
{GlobalWGReduction, 1},
{GlobalWGReduction, 2},
{GlobalWGReduction, 4},
{GlobalAllToZeroReduction, 1},
{GlobalAllToZeroReduction, 2},
{GlobalAllToZeroReduction, 4},
{Global4WGReduction, 1},
{Global4WGReduction, 2},
{Global4WGReduction, 4},
{Global4AllToZeroReduction, 1},
{Global4AllToZeroReduction, 2},
{Global4AllToZeroReduction, 4},
};
///////////////////////////////////////////////////////////////////////////////
// OCLPerfAtomicSpeed implementation.
///////////////////////////////////////////////////////////////////////////////
// Puts the test into its idle state: no host or device buffers, no
// programs/kernels, ten timed loops, a work-group size of 256, and one
// sub-test per entry of testOCLPerfAtomicSpeedList.
OCLPerfAtomicSpeed::OCLPerfAtomicSpeed() {
  // Test-harness bookkeeping.
  _numSubTests =
      sizeof(testOCLPerfAtomicSpeedList) / sizeof(testOCLPerfAtomicSpeedStruct);
  _numLoops = 10;

  // Capability / sizing flags, re-evaluated by open().
  _atomicsSupported = false;
  _dataSizeTooBig = false;
  _maxMemoryAllocationSize = 0;
  _nCurrentInputScale = 1;
  _workgroupSize = 256;

  // Host-side data.
  _input = NULL;
  _output = NULL;

  // Device-side objects.
  _inputBuffer = NULL;
  _outputBuffer = NULL;
  _kernels.clear();
  _programs.clear();
}
// Empty: the host buffers and OpenCL objects visible in this file are
// released by close(), not by the destructor.
OCLPerfAtomicSpeed::~OCLPerfAtomicSpeed() {}
void OCLPerfAtomicSpeed::open(unsigned int test, char *units,
double &conversion, unsigned int deviceId) {
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_uint num_devices = 0;
cl_device_id *devices = NULL;
cl_int status = CL_SUCCESS;
device = NULL;
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
_cpuReductionSum = 0;
_nCurrentInputScale = testOCLPerfAtomicSpeedList[_openTest].inputScale;
AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
// Setup stuff...
setupHistogram();
calculateHostBin();
context_ = 0;
cmd_queue_ = 0;
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
// Get last for default
#if 0
platform = platforms[numPlatforms-1];
for (unsigned i = 0; i < numPlatforms; ++i) {
#endif
char pbuf[100];
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
NULL);
#if 0
if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
platform = platforms[i];
break;
}
#endif
num_devices = 0;
/* Get the number of requested devices */
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
&num_devices);
// Runtime returns an error when no GPU devices are present instead of just
// returning 0 devices
// CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
// Choose platform with GPU devices
if (num_devices > 0) {
#if 0
if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
isAMD = true;
}
#endif
platform = platforms[_platformIndex];
}
#if 0
}
#endif
delete platforms;
}
/*
* If we could find our platform, use it. If not, die as we need the AMD
* platform for these extensions.
*/
CHECK_RESULT(platform == 0,
"Couldn't find platform with GPU devices, cannot proceed");
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
CHECK_RESULT(devices == 0, "no devices");
/* Get the requested device */
error_ =
_wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
device = devices[_deviceId];
context_ = _wrapper->clCreateContext(NULL, 1, &device, NULL, NULL, &error_);
CHECK_RESULT(context_ == 0, "clCreateContext failed");
char charbuf[1024];
size_t retsize;
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
charbuf, &retsize);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
// Global memory size
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_ulong),
&_maxMemoryAllocationSize, NULL);
CHECK_RESULT(error_ != CL_SUCCESS,
"clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_SIZE) failed");
// Check that the test size is not too big for the current GPU.
_dataSizeTooBig = false;
cl_ulong tenMB = 1024 * 10240;
if (_inputNBytes >= (_maxMemoryAllocationSize - tenMB)) {
_dataSizeTooBig = true;
return;
}
char *p = strstr(charbuf, "cl_khr_global_int32_base_atomics");
char *p2 = strstr(charbuf, "cl_khr_local_int32_base_atomics");
_atomicsSupported = false;
if (p || p2) _atomicsSupported = true;
// Verify atomics are supported.
if (!_atomicsSupported) return;
cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
// Create buffers...
_inputBuffer =
clCreateBuffer(context_, CL_MEM_READ_ONLY, _inputNBytes, 0, &status);
CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)");
// Create the programs/kernels for the current test type.
CreateKernels(atomicType);
_nThreadsPerGroup = _workgroupSize;
_nGroups = _nThreads / _nThreadsPerGroup;
_outputNBytes = _nGroups * NBINS * sizeof(cl_uint);
if (IsReduction(atomicType)) _outputNBytes = _inputNBytes;
_output = (cl_uint *)malloc(_outputNBytes);
if (0 == _output) {
_dataSizeTooBig = true;
return;
}
// Create output Buffer
_outputBuffer =
clCreateBuffer(context_, CL_MEM_READ_WRITE, _outputNBytes, 0, &status);
CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)");
}
// Create the programs/kernels for the current test type.
void OCLPerfAtomicSpeed::CreateKernels(const AtomicType atomicType) {
char log[16384];
cl_kernel kernel_;
cl_program program_;
char buildOptions[1000];
cl_int status = CL_SUCCESS;
SNPRINTF(buildOptions, sizeof(buildOptions),
"-D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS, BITS_PER_PIX,
NBANKS);
// Create the programs.
switch (atomicType) {
case LocalHistogram:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&local_atomics_histogram, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&local_atomics_reduce, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case LocalReductionNoAtomics:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&local_reduction, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case Local4ReductionNoAtomics:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&local_vec4_reduction, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case LocalReductionAtomics:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&local_atomics_reduction, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case Local4ReductionAtomics:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&local_vec4_atomics_reduction, NULL,
&error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case GlobalHistogram:
case Global4Histogram:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&global_atomics_histogram, NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case GlobalWGReduction:
case Global4WGReduction:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
default:
CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
}
// Build the programs.
for (size_t i = 0; i < _programs.size(); i++) {
error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
NULL, NULL);
if (error_ != CL_SUCCESS) {
status = _wrapper->clGetProgramBuildInfo(_programs[i], device,
CL_PROGRAM_BUILD_LOG,
16384 * sizeof(char), log, NULL);
printf("Build error -> %s\n", log);
CHECK_RESULT(0, "clBuildProgram failed");
}
}
switch (atomicType) {
case LocalHistogram:
kernel_ = _wrapper->clCreateKernel(_programs[0],
"local_atomics_histogram", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
kernel_ = _wrapper->clCreateKernel(_programs[1], "local_atomics_reduce",
&error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
case LocalReductionNoAtomics:
case Local4ReductionNoAtomics:
case LocalReductionAtomics:
case Local4ReductionAtomics:
kernel_ =
_wrapper->clCreateKernel(_programs[0], "local_reduction", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
case GlobalHistogram:
case Global4Histogram:
kernel_ = _wrapper->clCreateKernel(_programs[0],
"global_atomics_histogram", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
case GlobalWGReduction:
case Global4WGReduction:
kernel_ = _wrapper->clCreateKernel(
_programs[0], "global_atomics_sum_reduction_workgroup", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
kernel_ = _wrapper->clCreateKernel(
_programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
default:
CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
}
}
// Sets the kernel arguments based on the current test type.
void OCLPerfAtomicSpeed::SetKernelArguments(const AtomicType atomicType) {
int Arg = 0;
int localSize = 0;
int itemsPerThread = 1;
cl_int status = CL_SUCCESS;
switch (atomicType) {
case LocalHistogram:
// Set arguments for the local atomics histogram kernel
status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
(void *)&_inputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
(void *)&_outputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
status |= _wrapper->clSetKernelArg(_kernels[0], Arg++,
sizeof(_n4VectorsPerThread),
(void *)&_n4VectorsPerThread);
CHECK_RESULT(status, "clSetKernelArg failed. (n4VectorsPerThread)");
// Set arguments for the local atomics reduce kernel
Arg = 0;
status |= _wrapper->clSetKernelArg(_kernels[1], Arg++, sizeof(cl_mem),
(void *)&_outputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
status |= _wrapper->clSetKernelArg(_kernels[1], Arg++, sizeof(_nGroups),
(void *)&_nGroups);
CHECK_RESULT(status, "clSetKernelArg failed. (nGroups)");
break;
case LocalReductionAtomics:
case LocalReductionNoAtomics:
case Local4ReductionNoAtomics:
case Local4ReductionAtomics:
status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
(void *)&_inputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
(void *)&_outputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
localSize = DEFAULT_WG_SIZE * sizeof(cl_uint);
if ((Local4ReductionNoAtomics == atomicType) ||
(Local4ReductionAtomics == atomicType))
localSize *= 4;
status = _wrapper->clSetKernelArg(_kernels[0], Arg++, localSize, NULL);
CHECK_RESULT(status, "clSetKernelArg failed. (local memory)");
break;
case GlobalHistogram:
case Global4Histogram:
case GlobalWGReduction:
case Global4WGReduction:
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
// Set arguments for the global atomics histogram kernel
if ((Global4Histogram == atomicType) ||
(Global4WGReduction == atomicType) ||
(Global4AllToZeroReduction == atomicType))
itemsPerThread = 4;
status = _wrapper->clSetKernelArg(
_kernels[0], Arg++, sizeof(itemsPerThread), (void *)&itemsPerThread);
CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)");
status = _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
(void *)&_inputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");
status |= _wrapper->clSetKernelArg(_kernels[0], Arg++, sizeof(cl_mem),
(void *)&_outputBuffer);
CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
break;
default:
CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)");
}
}
// Since we write multiple times to the output in global atomics, need to
// reset the content every time.
// Zeroes the host copy (_output), uploads it to _outputBuffer with a
// blocking write and waits for the queue to drain. Failures are reported
// through CHECK_RESULT.
void OCLPerfAtomicSpeed::ResetGlobalOutput() {
  cl_int status;
  memset(_output, 0, _outputNBytes);
  status =
      _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0,
                                     _outputNBytes, _output, 0, NULL, NULL);
  CHECK_RESULT(status, "clEnqueueWriteBuffer failed.");
  status = _wrapper->clFinish(cmd_queue_);
  // Name the call that actually failed (this is clFinish, not clFlush).
  CHECK_RESULT(status, "clFinish failed.");
}
// Run the local histogram kernels.
void OCLPerfAtomicSpeed::RunLocalHistogram() {
cl_uint status;
cl_event events[2];
size_t globalThreads[3] = {1};
size_t localThreads[3] = {1};
size_t globalThreadsReduce = NBINS;
size_t localThreadsReduce = _nThreadsPerGroup;
globalThreads[0] = _nThreads;
localThreads[0] = _nThreadsPerGroup;
status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
globalThreads, localThreads, 0,
NULL, &events[0]);
CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (histogram)");
status = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_, _kernels[1], 1, NULL, &globalThreadsReduce,
&localThreadsReduce, 1, &events[0], &events[1]);
CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduce)");
status = _wrapper->clFinish(cmd_queue_);
CHECK_RESULT(status, "clFlush failed.");
status = _wrapper->clWaitForEvents(1, &events[0]);
status |= _wrapper->clWaitForEvents(1, &events[1]);
CHECK_RESULT(status, "clWaitForEvents failed.");
}
// Run the local reduction kernel.
void OCLPerfAtomicSpeed::RunLocalReduction(const AtomicType atomicType) {
cl_uint status;
size_t globalThreads[3] = {1};
size_t localThreads[3] = {1};
globalThreads[0] = _inputNBytes / sizeof(cl_uint) / 2;
localThreads[0] = _nThreadsPerGroup;
if ((Local4ReductionNoAtomics == atomicType) ||
(Local4ReductionAtomics == atomicType))
globalThreads[0] /= 4;
status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
globalThreads, localThreads, 0,
NULL, NULL);
CHECK_RESULT(status, "clEnqueueNDRangeKernel failed. (reduction)");
status = _wrapper->clFinish(cmd_queue_);
CHECK_RESULT(status, "clFlush failed.");
}
// Run the global histogram kernel.
// Also used for the global reduction test types. Launches kernel 0 with one
// work-item per input cl_uint (per four for the "4" variants), in groups of
// _nThreadsPerGroup, and waits for completion.
void OCLPerfAtomicSpeed::RunGlobalHistogram(AtomicType atomicType) {
  cl_int status;  // OpenCL status codes are signed
  size_t globalThreads = _inputNBytes / sizeof(cl_uint);
  size_t localThreads = _nThreadsPerGroup;
  if ((Global4Histogram == atomicType) || (Global4WGReduction == atomicType) ||
      (Global4AllToZeroReduction == atomicType))
    globalThreads /= 4;

  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
                                            &globalThreads, &localThreads, 0,
                                            NULL, NULL);
  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed.");
  status = _wrapper->clFinish(cmd_queue_);
  // Name the call that actually failed (this is clFinish, not clFlush).
  CHECK_RESULT(status, "clFinish failed.");
}
// Run the AtomicSpeed logic.
void OCLPerfAtomicSpeed::run() {
int Arg = 0;
cl_uint status;
AtomicType atomicType = testOCLPerfAtomicSpeedList[_openTest].atomicType;
// Verify atomics are supported.
if ((!_atomicsSupported) || (_dataSizeTooBig)) return;
// Write data to the GPU
status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0,
_inputNBytes, _input, 0, NULL, NULL);
CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)");
status = _wrapper->clFlush(cmd_queue_);
CHECK_RESULT(status, "clFlush failed.");
// Set the current arguments based on the test type.
SetKernelArguments(atomicType);
// Run the kernels.
CPerfCounter timer;
double totalTime = 0.0f;
for (unsigned int k = 0; k < _numLoops + 1; k++) {
// Since we run multiple times using global atomics the output
// would get accumulated therefore first clean it.
ResetGlobalOutput();
timer.Reset();
timer.Start();
switch (atomicType) {
case LocalHistogram:
RunLocalHistogram();
break;
case LocalReductionAtomics:
case LocalReductionNoAtomics:
case Local4ReductionNoAtomics:
case Local4ReductionAtomics:
RunLocalReduction(atomicType);
break;
case GlobalHistogram:
case Global4Histogram:
case GlobalWGReduction:
case Global4WGReduction:
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
RunGlobalHistogram(atomicType);
break;
default:
CHECK_RESULT(true, "Atomic type not supported");
}
timer.Stop();
// Don't count the warm-up
if (0 != k) totalTime += timer.GetElapsedTime();
}
// Read the results back to the CPU - Only do it for the last run
// of the test instead of for each iteration of _numLoops.
status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0,
_outputNBytes, _output, 0, NULL, NULL);
CHECK_RESULT(status, "clEnqueueReadBuffer failed.");
status = _wrapper->clFinish(cmd_queue_);
CHECK_RESULT(status, "clFlush failed.");
// Print the results.
PrintResults(atomicType, totalTime);
// Check the results for the current test.
_errorFlag = !(VerifyResults(atomicType));
}
// Compare the results and see if they match
bool OCLPerfAtomicSpeed::VerifyResults(const AtomicType atomicType) {
cl_uint i = 0;
bool flag = true;
cl_uint calculatedValue = 0;
cl_uint reductionElementCount = 0;
switch (atomicType) {
case LocalHistogram:
case GlobalHistogram:
case Global4Histogram:
for (i = 0; i < NBINS; ++i) {
if (_cpuhist[i] != _output[i]) {
flag = false;
break;
}
}
break;
case LocalReductionAtomics:
case LocalReductionNoAtomics:
case Local4ReductionNoAtomics:
case Local4ReductionAtomics:
case GlobalWGReduction:
case Global4WGReduction:
reductionElementCount =
_inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
for (i = 0; i < reductionElementCount; i++) {
calculatedValue += _output[i];
}
flag = (calculatedValue == _cpuReductionSum);
break;
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
flag = (_output[0] == _cpuReductionSum);
break;
default:
CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
return false;
}
if (!flag) printf("WRONG VALUES!!!!!");
return flag;
}
// Releases every kernel, program, buffer, the queue and the context created
// by open(), frees the host buffers, resets all handles so the object can be
// opened again, and returns _crcword. Release failures are reported through
// CHECK_RESULT_NO_RETURN so the remaining objects are still cleaned up.
unsigned int OCLPerfAtomicSpeed::close() {
  for (size_t k = 0; k < _kernels.size(); k++) {
    error_ = _wrapper->clReleaseKernel(_kernels[k]);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
  }
  // Separate index: sharing one counter with the kernel loop would start
  // this loop past the first programs and leave them unreleased.
  for (size_t p = 0; p < _programs.size(); p++) {
    error_ = _wrapper->clReleaseProgram(_programs[p]);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
  }
  if (_inputBuffer) {
    error_ = _wrapper->clReleaseMemObject(_inputBuffer);
    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
  }
  if (_outputBuffer) {
    error_ = _wrapper->clReleaseMemObject(_outputBuffer);
    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
  }
  if (cmd_queue_) {
    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseCommandQueue failed");
  }
  if (context_) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
  }

  // Free host memory.
  free(_input);
  free(_output);

  // Reset everything.
  _kernels.clear();
  _programs.clear();
  _inputBuffer = NULL;
  _outputBuffer = NULL;
  cmd_queue_ = NULL;
  context_ = NULL;
  _input = NULL;
  _output = NULL;
  return _crcword;
}
/* Helper functions */
void OCLPerfAtomicSpeed::calculateHostBin() {
// compute CPU histogram
cl_int *p = (cl_int *)_input;
memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
_cpuReductionSum = 0;
for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++) {
_cpuhist[(p[i] >> 24) & 0xff]++;
_cpuhist[(p[i] >> 16) & 0xff]++;
_cpuhist[(p[i] >> 8) & 0xff]++;
_cpuhist[(p[i] >> 0) & 0xff]++;
_cpuReductionSum += ((p[i] >> 24) & 0x3) + ((p[i] >> 16) & 0x3) +
((p[i] >> 8) & 0x3) + ((p[i] >> 0) & 0x3);
}
}
void OCLPerfAtomicSpeed::setupHistogram() {
cl_int status = 0;
_nThreads = 64 * 1024;
#if defined(_WIN32) && !defined(_WIN64)
_n4Vectors = 1024 * 1024;
#else
_n4Vectors = 2048 * 2048;
#endif
_n4Vectors *= _nCurrentInputScale;
_n4VectorsPerThread = _n4Vectors / _nThreads;
_inputNBytes = _n4Vectors * sizeof(cl_uint4);
_input = (cl_uint *)malloc(_inputNBytes);
if (0 == _input) {
_dataSizeTooBig = true;
return;
}
// random initialization of input
time_t ltime;
time(&ltime);
cl_uint a = (cl_uint)ltime, b = (cl_uint)ltime;
cl_uint *p = (cl_uint *)_input;
for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++)
p[i] = (b = (a * (b & 65535)) + (b >> 16));
}
// Publishes the outcome of the current test: stores the throughput (GB of
// histogram data per second, averaged over _numLoops) in _perfInfo and a
// description line naming the test type, input size and average time in
// testDescString. An unknown test type is reported through CHECK_RESULT.
void OCLPerfAtomicSpeed::PrintResults(const AtomicType atomicType,
                                      double totalTime) {
  const char *label = NULL;
  switch (atomicType) {
    case LocalHistogram:
      label = "Local histogram";
      break;
    case GlobalHistogram:
      label = "Global histogram";
      break;
    case Global4Histogram:
      label = "Global vec 4 histogram";
      break;
    case LocalReductionNoAtomics:
      label = "Local reduction NO atomics";
      break;
    case Local4ReductionNoAtomics:
      label = "Local vec 4 reduction NO atomics";
      break;
    case LocalReductionAtomics:
      label = "Local reduction with atomics";
      break;
    case Local4ReductionAtomics:
      label = "Local vec 4 reduction with atomics";
      break;
    case GlobalWGReduction:
      label = "Global work-group reduction";
      break;
    case Global4WGReduction:
      label = "Global vec 4 work-group reduction";
      break;
    case GlobalAllToZeroReduction:
      label = "Global all to zero reduction";
      break;
    case Global4AllToZeroReduction:
      label = "Global vec 4 all to zero reduction";
      break;
    default:
      CHECK_RESULT(true, "Atomic type not supported (PrintResults)");
  }

  // each cl_uint in _inputNBytes contributes 4 items.
  const double histogramDataGB = (double)_inputNBytes * (double)(1e-09) * 4;
  const double secondsPerLoop = totalTime / _numLoops;

  char line[500];
  SNPRINTF(line, sizeof(line), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s",
           label, histogramDataGB, secondsPerLoop);
  _perfInfo = (float)(histogramDataGB / secondsPerLoop);
  testDescString = line;
}
bool OCLPerfAtomicSpeed::IsReduction(const AtomicType atomicType) {
return ((atomicType >= LocalReductionNoAtomics) &&
(atomicType <= GlobalAllToZeroReduction));
}
@@ -0,0 +1,119 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_AtomicSpeed_H_
#define _OCL_AtomicSpeed_H_
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "OCLTestImp.h"
#define DEFAULT_WG_SIZE 256
#define NBINS 256
#define BITS_PER_PIX 8
#define NBANKS 16
// Identifies which atomic benchmark variant a sub-test exercises.
// The numeric values are spelled out because code compares enumerators by
// range (see IsReduction), so the declaration order is significant.
enum AtomicType {
  // Histogram variants.
  LocalHistogram = 0,
  GlobalHistogram = 1,
  Global4Histogram = 2,
  // Reduction variants.
  LocalReductionNoAtomics = 3,
  Local4ReductionNoAtomics = 4,
  LocalReductionAtomics = 5,
  Local4ReductionAtomics = 6,
  GlobalWGReduction = 7,
  Global4WGReduction = 8,
  GlobalAllToZeroReduction = 9,
  Global4AllToZeroReduction = 10
};
// One sub-test description: the atomic variant to run and the multiplier
// applied to the base input size.
struct testOCLPerfAtomicSpeedStruct {
  AtomicType atomicType;  // Variant of the benchmark to execute.
  int inputScale;         // Input size multiplier for this sub-test.
};
// Define the OCLPerfAtomicSpeed class.
// Test object for the atomic speed benchmarks selected by AtomicType
// (histogram and reduction variants). It plugs into the framework through
// the open()/run()/close() overrides of OCLTestImp.
// NOTE(review): most of the implementation lives outside the part of the
// file reviewed here, so member descriptions below are limited to what the
// declarations themselves show.
class OCLPerfAtomicSpeed : public OCLTestImp {
 public:
  OCLPerfAtomicSpeed();
  virtual ~OCLPerfAtomicSpeed();

 public:
  // Framework entry points overridden from OCLTestImp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  virtual unsigned int close(void);

  // OpenCL handles used by the test.
  // NOTE(review): OCLPerfAtomicSpeed20 uses a context_ without declaring
  // one, so OCLTestImp presumably already has a member of that name which
  // this declaration would hide — confirm.
  cl_context context_;
  cl_command_queue cmd_queue_;
  std::vector<cl_program> _programs;
  std::vector<cl_kernel> _kernels;
  cl_device_id device;

  bool _atomicsSupported;
  bool _dataSizeTooBig;
  cl_uint _numLoops;

  // Histogram related stuff...
 private:
  cl_ulong _maxMemoryAllocationSize;
  cl_uint _inputNBytes;
  cl_uint _outputNBytes;
  cl_uint _nCurrentInputScale;
  cl_uint _workgroupSize;
  // cl_uint nLoops;
  cl_uint _nThreads;
  cl_uint _nThreadsPerGroup;
  cl_uint _nGroups;
  cl_uint _n4Vectors;
  cl_uint _n4VectorsPerThread;
  cl_uint _nBins;
  cl_uint _nBytesLDSPerGrp;

  // Host-side copies of the input/output data and their device buffers.
  cl_uint* _input;
  cl_uint* _output;
  cl_mem _inputBuffer;
  cl_mem _outputBuffer;

  // Host reference results: a histogram with NBINS entries and a single
  // reduction sum.
  cl_uint _cpuhist[NBINS];
  cl_uint _cpuReductionSum;

  void calculateHostBin();
  void setupHistogram();
  bool VerifyResults(const AtomicType atomicType);
  void ResetGlobalOutput();

  // Methods that do the actual NDRange.
  void RunLocalHistogram();
  void RunLocalReduction(const AtomicType atomicType);
  void RunGlobalHistogram(const AtomicType atomicType);

  void CreateKernels(const AtomicType atomicType);
  // True when atomicType falls inside the reduction range of AtomicType.
  bool IsReduction(const AtomicType atomicType);
  void SetKernelArguments(const AtomicType atomicType);
  // Formats the description string for the finished sub-test and stores the
  // measured throughput in _perfInfo.
  void PrintResults(const AtomicType atomicType, double totalTime);
};
#endif // _OCL_AtomicSpeed_H_
@@ -0,0 +1,509 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfAtomicSpeed20.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "CL/cl.h"
#include "OCLPerfAtomicSpeed20Kernels.h"
#include "Timer.h"
// Quiet pesky warnings: select the bounded formatting function per platform.
// When WIN_OS is defined SNPRINTF expands to sprintf_s, otherwise to
// snprintf. Call sites in this file pass (buffer, sizeof(buffer), format, ...).
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Sub-test table: every reduction variant is run with input scales 1, 2
// and 4. The index into this table is the sub-test number given to open().
testOCLPerfAtomicSpeed20Struct testOCLPerfAtomicSpeed20List[] = {
    // Scalar loads, one partial sum per work-group.
    {GlobalWGReduction, 1},
    {GlobalWGReduction, 2},
    {GlobalWGReduction, 4},
    // Scalar loads, every work-item adds into element 0.
    {GlobalAllToZeroReduction, 1},
    {GlobalAllToZeroReduction, 2},
    {GlobalAllToZeroReduction, 4},
    // Four items per work-item, one partial sum per work-group.
    {Global4WGReduction, 1},
    {Global4WGReduction, 2},
    {Global4WGReduction, 4},
    // Four items per work-item, every work-item adds into element 0.
    {Global4AllToZeroReduction, 1},
    {Global4AllToZeroReduction, 2},
    {Global4AllToZeroReduction, 4},
};
///////////////////////////////////////////////////////////////////////////////
// OCLPerfAtomicSpeed20 implementation.
///////////////////////////////////////////////////////////////////////////////
// Puts the test object into its idle state: no buffers, no kernels, ten
// timed loops, a work-group size of 256, and one sub-test per entry of
// testOCLPerfAtomicSpeed20List.
OCLPerfAtomicSpeed20::OCLPerfAtomicSpeed20()
    : _atomicsSupported(false),
      _dataSizeTooBig(false),
      _numLoops(10),
      _maxMemoryAllocationSize(0),
      _nCurrentInputScale(1),
      _workgroupSize(256),
      _input(NULL),
      _output(NULL),
      _inputBuffer(NULL),
      _outputBuffer(NULL),
      skip_(false) {
  // _numSubTests is not declared by this class, so it is assigned in the
  // body instead of the initializer list.
  _numSubTests = sizeof(testOCLPerfAtomicSpeed20List) /
                 sizeof(testOCLPerfAtomicSpeed20Struct);
  _kernels.clear();
  _programs.clear();
}
// Intentionally empty: kernels, programs, device buffers and host buffers
// are released in close(), not here.
OCLPerfAtomicSpeed20::~OCLPerfAtomicSpeed20() {}
// Prepares sub-test `test` on device `deviceId`.
//
// Steps: open the base test, generate the host input and its reference sum,
// query the device (extension string, maximum allocation size), then create
// the input buffer, the kernel and the output buffer.
//
// Parameters:
//   test       - index into testOCLPerfAtomicSpeed20List.
//   units      - forwarded to OCLTestImp::open().
//   conversion - forwarded to OCLTestImp::open(), then set to 1.0.
//   deviceId   - forwarded to OCLTestImp::open().
//
// The function returns early, leaving run() with nothing to do, when
//   - it was built without CL_VERSION_2_0 (skip_ is set),
//   - a host allocation failed or the input does not fit the device
//     allocation limit (_dataSizeTooBig is set),
//   - the device does not report cl_khr_global_int32_base_atomics
//     (_atomicsSupported stays false),
// and whenever a CHECK_RESULT condition reports a failed OpenCL call.
void OCLPerfAtomicSpeed20::open(unsigned int test, char *units,
                                double &conversion, unsigned int deviceId) {
  error_ = CL_SUCCESS;
  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
  program_ = 0;
  kernel_ = 0;
#if defined(CL_VERSION_2_0)
  cl_int status = CL_SUCCESS;
  conversion = 1.0f;
  _openTest = test;
  _cpuReductionSum = 0;
  _atomicsSupported = false;
  // Cleared before setupHistogram() so that a failed host allocation
  // reported through this flag is not overwritten afterwards.
  _dataSizeTooBig = false;
  _nCurrentInputScale = testOCLPerfAtomicSpeed20List[_openTest].inputScale;
  AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType;

  // Setup stuff...
  setupHistogram();
  // Without input data there is nothing to reduce; calculateHostBin() would
  // read through a null pointer.
  if (_dataSizeTooBig) return;
  calculateHostBin();

  cl_device_id device = devices_[_deviceId];
  // Run on the queue already stored in cmdQueues_ for this device. Creating
  // a second queue here would leak it, because close() never releases
  // cmd_queue_.
  cmd_queue_ = cmdQueues_[_deviceId];

  // Query the extension string with a buffer sized by the device, so long
  // extension lists do not make the query fail.
  size_t extensionsSize = 0;
  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL,
                                     &extensionsSize);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  // One spare byte keeps the buffer non-empty and NUL-terminated.
  std::vector<char> extensions(extensionsSize + 1, '\0');
  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS,
                                     extensions.size(), &extensions[0], NULL);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");

  // Global memory size
  error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                                     sizeof(cl_ulong),
                                     &_maxMemoryAllocationSize, NULL);
  CHECK_RESULT(error_ != CL_SUCCESS,
               "clGetDeviceInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");

  // Check that the test size is not too big for the current GPU. The sum is
  // formed on the input side so a limit below 10 MB cannot wrap around.
  const cl_ulong tenMB = 1024 * 10240;
  if ((cl_ulong)_inputNBytes + tenMB >= _maxMemoryAllocationSize) {
    _dataSizeTooBig = true;
    return;
  }

  // Verify atomics are supported.
  _atomicsSupported =
      (strstr(&extensions[0], "cl_khr_global_int32_base_atomics") != NULL);
  if (!_atomicsSupported) return;

  // Create buffers...
  _inputBuffer =
      clCreateBuffer(context_, CL_MEM_READ_ONLY, _inputNBytes, 0, &status);
  CHECK_RESULT(status, "clCreateBuffer failed. (inputBuffer)");

  // Create the programs/kernels for the current test type.
  CreateKernels(atomicType);

  _nThreadsPerGroup = _workgroupSize;
  _nGroups = _nThreads / _nThreadsPerGroup;
  // The output buffer is as large as the input.
  _outputNBytes = _inputNBytes;
  _output = (cl_uint *)malloc(_outputNBytes);
  if (0 == _output) {
    _dataSizeTooBig = true;
    return;
  }

  // Create output Buffer
  _outputBuffer =
      clCreateBuffer(context_, CL_MEM_READ_WRITE, _outputNBytes, 0, &status);
  CHECK_RESULT(status, "clCreateBuffer failed. (outputBuffer)");
#else
  skip_ = true;
  testDescString = "OpenCL verion < 2.0. Test Skipped.";
  return;
#endif
}
// Create the programs/kernels for the current test type.
void OCLPerfAtomicSpeed20::CreateKernels(const AtomicType atomicType) {
char log[16384];
cl_kernel kernel_;
cl_program program_;
char buildOptions[1000];
cl_int status = CL_SUCCESS;
cl_device_id device = devices_[_deviceId];
SNPRINTF(buildOptions, sizeof(buildOptions),
"-cl-std=CL2.0 -D NBINS=%d -D BITS_PER_PIX=%d -D NBANKS=%d", NBINS,
BITS_PER_PIX, NBANKS);
// Create the programs.
switch (atomicType) {
case GlobalWGReduction:
case Global4WGReduction:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&global_atomics_sum_reduction_workgroup,
NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
program_ = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&global_atomics_sum_reduction_all_to_zero,
NULL, &error_);
CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");
_programs.push_back(program_);
break;
default:
CHECK_RESULT(true, "Atomic type not supported (clCreateProgram)");
}
// Build the programs.
for (size_t i = 0; i < _programs.size(); i++) {
error_ = _wrapper->clBuildProgram(_programs[i], 1, &device, buildOptions,
NULL, NULL);
if (error_ != CL_SUCCESS) {
status = _wrapper->clGetProgramBuildInfo(_programs[i], device,
CL_PROGRAM_BUILD_LOG,
16384 * sizeof(char), log, NULL);
printf("Build error -> %s\n", log);
CHECK_RESULT(0, "clBuildProgram failed");
}
}
switch (atomicType) {
case GlobalWGReduction:
case Global4WGReduction:
kernel_ = _wrapper->clCreateKernel(
_programs[0], "global_atomics_sum_reduction_workgroup", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
kernel_ = _wrapper->clCreateKernel(
_programs[0], "global_atomics_sum_reduction_all_to_zero", &error_);
CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");
_kernels.push_back(kernel_);
break;
default:
CHECK_RESULT(true, "Atomic type not supported (clCreateKernel)");
}
}
// Binds the three arguments of the reduction kernel in _kernels[0]:
// (0) items processed per work-item, (1) input buffer, (2) output buffer.
// The vec 4 variants process four items per work-item, the others one.
void OCLPerfAtomicSpeed20::SetKernelArguments(const AtomicType atomicType) {
  const bool vec4Variant = (Global4WGReduction == atomicType) ||
                           (Global4AllToZeroReduction == atomicType);
  const bool scalarVariant = (GlobalWGReduction == atomicType) ||
                             (GlobalAllToZeroReduction == atomicType);
  if (!vec4Variant && !scalarVariant) {
    CHECK_RESULT(true, "Atomic type not supported (clSetKernelArg)");
    return;
  }

  int itemsPerThread = vec4Variant ? 4 : 1;
  int argIndex = 0;
  cl_int status = CL_SUCCESS;

  status = _wrapper->clSetKernelArg(_kernels[0], argIndex, sizeof(itemsPerThread),
                                    (void *)&itemsPerThread);
  ++argIndex;
  CHECK_RESULT(status, "clSetKernelArg failed. (itemsPerThread)");

  status = _wrapper->clSetKernelArg(_kernels[0], argIndex, sizeof(cl_mem),
                                    (void *)&_inputBuffer);
  ++argIndex;
  CHECK_RESULT(status, "clSetKernelArg failed. (inputBuffer)");

  status |= _wrapper->clSetKernelArg(_kernels[0], argIndex, sizeof(cl_mem),
                                     (void *)&_outputBuffer);
  ++argIndex;
  CHECK_RESULT(status, "clSetKernelArg failed. (outputBuffer)");
}
// Since the kernels accumulate into the output with global atomics, the
// output has to be zeroed before every launch: clears the host copy and
// uploads it to the device output buffer, then waits for the queue to drain.
void OCLPerfAtomicSpeed20::ResetGlobalOutput() {
  cl_int status;
  memset(_output, 0, _outputNBytes);
  status =
      _wrapper->clEnqueueWriteBuffer(cmd_queue_, _outputBuffer, CL_TRUE, 0,
                                     _outputNBytes, _output, 0, NULL, NULL);
  CHECK_RESULT(status, "clEnqueueWriteBuffer failed.");
  status = _wrapper->clFinish(cmd_queue_);
  // Name the call that actually failed (this used to say clFlush).
  CHECK_RESULT(status, "clFinish failed.");
}
// Launches the global-atomics reduction kernel in _kernels[0] once and waits
// for it to finish. (The name is historical; this class only runs reduction
// kernels.)
//
// One work-item is launched per input cl_uint; the vec 4 variants launch a
// quarter of that because each work-item then processes four items. The
// work-group size is _nThreadsPerGroup.
void OCLPerfAtomicSpeed20::RunGlobalHistogram(AtomicType atomicType) {
  // OpenCL calls return cl_int (error codes are negative).
  cl_int status;
  // Only dimension 0 is used; the launch below is one-dimensional.
  size_t globalThreads[3] = {1};
  size_t localThreads[3] = {1};
  globalThreads[0] = _inputNBytes / sizeof(cl_uint);
  localThreads[0] = _nThreadsPerGroup;
  if ((Global4WGReduction == atomicType) ||
      (Global4AllToZeroReduction == atomicType))
    globalThreads[0] /= 4;
  status = _wrapper->clEnqueueNDRangeKernel(cmd_queue_, _kernels[0], 1, NULL,
                                            globalThreads, localThreads, 0,
                                            NULL, NULL);
  CHECK_RESULT(status, "clEnqueueNDRangeKernel failed.");
  status = _wrapper->clFinish(cmd_queue_);
  // Name the call that actually failed (this used to say clFlush).
  CHECK_RESULT(status, "clFinish failed.");
}
// Run the AtomicSpeed logic.
//
// Uploads the input, binds the kernel arguments, then launches the kernel
// _numLoops + 1 times. The first launch is a warm-up and is not timed; the
// output buffer is cleared (outside the timed region) before every launch.
// Finally the output of the last launch is read back, the result string and
// _perfInfo are produced, and the result is compared with the host reference.
//
// Does nothing when the test is skipped, atomics are unsupported, or the
// data set was found to be too big in open().
void OCLPerfAtomicSpeed20::run() {
  if (skip_) {
    return;
  }
#if defined(CL_VERSION_2_0)
  // OpenCL calls return cl_int (error codes are negative).
  cl_int status;
  AtomicType atomicType = testOCLPerfAtomicSpeed20List[_openTest].atomicType;
  // Verify atomics are supported.
  if ((!_atomicsSupported) || (_dataSizeTooBig)) return;

  // Write data to the GPU
  status = _wrapper->clEnqueueWriteBuffer(cmd_queue_, _inputBuffer, CL_FALSE, 0,
                                          _inputNBytes, _input, 0, NULL, NULL);
  CHECK_RESULT(status, "clEnqueueWriteBuffer failed. (inputBuffer)");
  status = _wrapper->clFlush(cmd_queue_);
  CHECK_RESULT(status, "clFlush failed.");

  // Set the current arguments based on the test type.
  SetKernelArguments(atomicType);

  // Run the kernels.
  CPerfCounter timer;
  double totalTime = 0.0f;
  for (unsigned int k = 0; k < _numLoops + 1; k++) {
    // Since we run multiple times using global atomics the output
    // would get accumulated therefore first clean it.
    ResetGlobalOutput();
    timer.Reset();
    timer.Start();
    switch (atomicType) {
      case GlobalWGReduction:
      case Global4WGReduction:
      case GlobalAllToZeroReduction:
      case Global4AllToZeroReduction:
        RunGlobalHistogram(atomicType);
        break;
      default:
        CHECK_RESULT(true, "Atomic type not supported");
    }
    timer.Stop();
    // Don't count the warm-up
    if (0 != k) totalTime += timer.GetElapsedTime();
  }

  // The read is non-blocking; the clFinish below makes _output valid.
  status = _wrapper->clEnqueueReadBuffer(cmd_queue_, _outputBuffer, CL_FALSE, 0,
                                         _outputNBytes, _output, 0, NULL, NULL);
  CHECK_RESULT(status, "clEnqueueReadBuffer failed.");
  status = _wrapper->clFinish(cmd_queue_);
  // Name the call that actually failed (this used to say clFlush).
  CHECK_RESULT(status, "clFinish failed.");

  // Print the results.
  PrintResults(atomicType, totalTime);
  // Check the results for the current test.
  _errorFlag = !(VerifyResults(atomicType));
#endif
}
// Compare the results and see if they match
bool OCLPerfAtomicSpeed20::VerifyResults(const AtomicType atomicType) {
cl_uint i = 0;
bool flag = true;
cl_uint calculatedValue = 0;
cl_uint reductionElementCount = 0;
switch (atomicType) {
case GlobalWGReduction:
case Global4WGReduction:
reductionElementCount =
_inputNBytes / sizeof(cl_uint) / _nThreadsPerGroup;
for (i = 0; i < reductionElementCount; i++) {
calculatedValue += _output[i];
}
flag = (calculatedValue == _cpuReductionSum);
break;
case GlobalAllToZeroReduction:
case Global4AllToZeroReduction:
flag = (_output[0] == _cpuReductionSum);
break;
default:
CHECK_RESULT_NO_RETURN(true, "Atomic type not supported (VerifyResults)");
return false;
}
if (!flag) printf("WRONG VALUES!!!!!");
return flag;
}
// Releases everything open() created for the current sub-test: kernels,
// programs, both device buffers and both host buffers. All members are reset
// so the object can be opened again. Returns the result of
// OCLTestImp::close().
unsigned int OCLPerfAtomicSpeed20::close() {
  // Each loop needs its own index. With a shared index the program loop
  // started where the kernel loop stopped and released nothing.
  for (size_t k = 0; k < _kernels.size(); k++) {
    error_ = _wrapper->clReleaseKernel(_kernels[k]);
    CHECK_RESULT_NO_RETURN(error_, "clReleaseKernel failed.");
  }
  for (size_t p = 0; p < _programs.size(); p++) {
    error_ = _wrapper->clReleaseProgram(_programs[p]);
    CHECK_RESULT_NO_RETURN(error_, "clReleaseProgram failed.");
  }
  if (_inputBuffer) {
    error_ = clReleaseMemObject(_inputBuffer);
    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(inputBuffer )");
  }
  if (_outputBuffer) {
    error_ = clReleaseMemObject(_outputBuffer);
    CHECK_RESULT_NO_RETURN(error_, "clReleaseMemObject failed.(outputBuffer)");
  }

  // Free host memory (free(NULL) is a no-op).
  free(_input);
  free(_output);

  // Reset everything.
  _kernels.clear();
  _programs.clear();
  _inputBuffer = NULL;
  _outputBuffer = NULL;
  _input = NULL;
  _output = NULL;
  return OCLTestImp::close();
}
/* Helper functions */
void OCLPerfAtomicSpeed20::calculateHostBin() {
// compute CPU histogram
cl_int *p = (cl_int *)_input;
memset(_cpuhist, 0, NBINS * sizeof(cl_uint));
_cpuReductionSum = 0;
for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++) {
_cpuhist[(p[i] >> 24) & 0xff]++;
_cpuhist[(p[i] >> 16) & 0xff]++;
_cpuhist[(p[i] >> 8) & 0xff]++;
_cpuhist[(p[i] >> 0) & 0xff]++;
_cpuReductionSum += ((p[i] >> 24) & 0x3) + ((p[i] >> 16) & 0x3) +
((p[i] >> 8) & 0x3) + ((p[i] >> 0) & 0x3);
}
}
void OCLPerfAtomicSpeed20::setupHistogram() {
cl_int status = 0;
_nThreads = 64 * 1024;
_n4Vectors = 2048 * 2048;
_n4Vectors *= _nCurrentInputScale;
_n4VectorsPerThread = _n4Vectors / _nThreads;
_inputNBytes = _n4Vectors * sizeof(cl_uint4);
_input = (cl_uint *)malloc(_inputNBytes);
if (0 == _input) {
_dataSizeTooBig = true;
return;
}
// random initialization of input
time_t ltime;
time(&ltime);
cl_uint a = (cl_uint)ltime, b = (cl_uint)ltime;
cl_uint *p = (cl_uint *)_input;
for (unsigned int i = 0; i < _inputNBytes / sizeof(cl_uint); i++)
p[i] = (b = (a * (b & 65535)) + (b >> 16));
}
// Builds the description line for the finished sub-test and records its
// throughput. totalTime is the accumulated time of the _numLoops timed
// launches; the reported time is the average per launch and _perfInfo is
// the processed data volume (GB) divided by that average.
void OCLPerfAtomicSpeed20::PrintResults(const AtomicType atomicType,
                                        double totalTime) {
  const char *variantLabel = NULL;
  switch (atomicType) {
    case GlobalWGReduction:
      variantLabel = "Global work-group reduction";
      break;
    case Global4WGReduction:
      variantLabel = "Global vec 4 work-group reduction";
      break;
    case GlobalAllToZeroReduction:
      variantLabel = "Global all to zero reduction";
      break;
    case Global4AllToZeroReduction:
      variantLabel = "Global vec 4 all to zero reduction";
      break;
    default:
      CHECK_RESULT(true, "Atomic type not supported (PrintResults)");
  }

  const double inputInGB = (double)_inputNBytes * (double)(1e-09);
  // each cl_uint in _inputNBytes contributes 4 items.
  const double totalHistogramDataInGB = (double)inputInGB * 4;
  const double perf = totalTime / _numLoops;

  char line[500];
  SNPRINTF(line, sizeof(line), "%45s: Input [%.3f GB], Time [%.3f sec]: GB/s",
           variantLabel, totalHistogramDataInGB, perf);
  _perfInfo = (float)(totalHistogramDataInGB / perf);
  testDescString = line;
}
@@ -0,0 +1,102 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_AtomicSpeed20_H_
#define _OCL_AtomicSpeed20_H_
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "OCLTestImp.h"
#define DEFAULT_WG_SIZE 256
#define NBINS 256
#define BITS_PER_PIX 8
#define NBANKS 16
#include "OCLPerfAtomicSpeed.h"
// One sub-test description for OCLPerfAtomicSpeed20: the reduction variant
// to run and the multiplier applied to the base input size.
struct testOCLPerfAtomicSpeed20Struct {
  AtomicType atomicType;  // Variant of the benchmark to execute.
  int inputScale;         // Input size multiplier for this sub-test.
};
// Define the OCLPerfAtomicSpeed20 class.
// Performance test for global-memory atomic sum reductions whose kernels are
// built with -cl-std=CL2.0. Each sub-test selects a reduction variant and an
// input scale; open() prepares data and OpenCL objects, run() times the
// kernel and verifies the result, close() releases everything.
class OCLPerfAtomicSpeed20 : public OCLTestImp {
 public:
  OCLPerfAtomicSpeed20();
  virtual ~OCLPerfAtomicSpeed20();

 public:
  // Framework entry points overridden from OCLTestImp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  virtual unsigned int close(void);

  // Queue on which all transfers and kernel launches are enqueued.
  cl_command_queue cmd_queue_;
  // Programs and kernels created for the open sub-test; element 0 is the
  // one that is used.
  std::vector<cl_program> _programs;
  std::vector<cl_kernel> _kernels;
  // True when the device extension string lists
  // cl_khr_global_int32_base_atomics.
  bool _atomicsSupported;
  // True when a host allocation failed or the input does not fit the
  // device allocation limit; run() then does nothing.
  bool _dataSizeTooBig;
  // Number of timed kernel launches (one extra warm-up launch precedes them).
  cl_uint _numLoops;

  // Histogram related stuff...
 private:
  // CL_DEVICE_MAX_MEM_ALLOC_SIZE of the selected device.
  cl_ulong _maxMemoryAllocationSize;
  // Sizes in bytes of the input and output data.
  cl_uint _inputNBytes;
  cl_uint _outputNBytes;
  // Input size multiplier of the open sub-test.
  cl_uint _nCurrentInputScale;
  // Work-group size requested for the launch (256).
  cl_uint _workgroupSize;
  // cl_uint nLoops;
  cl_uint _nThreads;
  cl_uint _nThreadsPerGroup;
  cl_uint _nGroups;
  // Number of cl_uint4 elements in the input.
  cl_uint _n4Vectors;
  cl_uint _n4VectorsPerThread;
  // NOTE(review): _nBins and _nBytesLDSPerGrp are not referenced by the
  // member functions visible in this part of the file — confirm whether they
  // are still needed.
  cl_uint _nBins;
  cl_uint _nBytesLDSPerGrp;

  // Host copies of the input and output, allocated with malloc.
  cl_uint* _input;
  cl_uint* _output;
  // Device buffers matching _input and _output.
  cl_mem _inputBuffer;
  cl_mem _outputBuffer;
  // Set when the file is compiled without CL_VERSION_2_0; run() then
  // returns immediately.
  bool skip_;

  // Host reference data: byte-value histogram of the input and the sum of
  // the low two bits of every input byte.
  cl_uint _cpuhist[NBINS];
  cl_uint _cpuReductionSum;

  // Fills _cpuhist and _cpuReductionSum from _input.
  void calculateHostBin();
  // Sizes the problem and generates the pseudo-random input.
  void setupHistogram();
  // Compares the device output with _cpuReductionSum.
  bool VerifyResults(const AtomicType atomicType);
  // Zeroes the host and device output before a launch.
  void ResetGlobalOutput();

  // Methods that do the actual NDRange.
  void RunGlobalHistogram(const AtomicType atomicType);

  void CreateKernels(const AtomicType atomicType);
  void SetKernelArguments(const AtomicType atomicType);
  void PrintResults(const AtomicType atomicType, double totalTime);
};
#endif // _OCL_AtomicSpeed20_H_
@@ -0,0 +1,73 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
// OpenCL C source of the "all to zero" reduction kernel (OpenCL 2.0 atomics).
// Each work-item reads ItemsPerThread uints, starting at its global id and
// advancing by the global size, adds up the low two bits of each of the four
// bytes of every uint, and then adds its total to Output[0] with a relaxed,
// device-scope atomic_fetch_add_explicit.
// Some literal pieces have no trailing "\n"; the statements are still
// separated by ';' in the concatenated source.
static const char *global_atomics_sum_reduction_all_to_zero =
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    " __kernel void global_atomics_sum_reduction_all_to_zero(uint "
    "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n"
    "{\n"
    "    uint sum = 0;\n"
    "    const uint msk = (uint)3;\n"
    "    const uint shft = (uint)8;\n"
    "    \n"
    "    uint tid = get_global_id(0);\n"
    "    uint Stride = get_global_size(0);\n"
    "    for( int i = 0; i < ItemsPerThread; i++)\n"
    "    {\n"
    "      uint data = Input[tid];\n"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      tid += Stride;\n"
    "    }\n"
    "    atomic_fetch_add_explicit( &(Output[0]), sum, memory_order_relaxed, "
    "memory_scope_device);\n"
    "}\n";
// OpenCL C source of the per-work-group reduction kernel (OpenCL 2.0
// atomics). Each work-item reads ItemsPerThread uints, starting at its
// global id and advancing by the global size, adds up the low two bits of
// each of the four bytes of every uint, and then adds its total to
// Output[get_group_id(0)] with a relaxed, device-scope
// atomic_fetch_add_explicit. The host sums the per-group values afterwards.
static const char *global_atomics_sum_reduction_workgroup =
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    " __kernel void global_atomics_sum_reduction_workgroup(uint "
    "ItemsPerThread, __global uint *Input, __global atomic_int *Output )\n"
    "{\n"
    "    uint sum = 0;\n"
    "    const uint msk = (uint)3;\n"
    "    const uint shft = (uint)8;\n"
    "    \n"
    "    uint tid = get_global_id(0);\n"
    "    uint Stride = get_global_size(0);\n"
    "    for( int i = 0; i < ItemsPerThread; i++)\n"
    "    {\n"
    "      uint data = Input[tid];\n"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      tid += Stride;\n"
    "    }\n"
    "    atomic_fetch_add_explicit( &(Output[get_group_id(0)]), sum, "
    "memory_order_relaxed, memory_scope_device);\n"
    "}\n";
@@ -0,0 +1,402 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
// OpenCL C source of the local-memory atomics histogram kernel.
// Expects NBANKS, NBINS and BITS_PER_PIX to be defined when the program is
// built, and requires a work-group size of 256.
// Phases, as written in the kernel:
//   1. clear the __local sub-histograms (NBANKS * NBINS counters),
//   2. every work-item reads n4VectorsPerThread uint4 values with a stride
//      of the global size and atom_inc's one counter per byte of each
//      component (four passes, shifting by BITS_PER_PIX each time); the
//      counter index is bin * NBANKS + (local id % NBANKS),
//   3. work-items with local id < NBINS add up the NBANKS counters of their
//      bin and store the result at Histogram[group * NBINS + bin].
// The MIN/MAX helper macros expand to a conditional expression without
// outer parentheses; here they are only used as the complete right-hand side
// of an assignment.
// NOTE(review): the comment "at least 1" in the kernel text sits above a
// MIN(1, ...) expression — confirm the clear-phase thread computation is as
// intended.
static const char *local_atomics_histogram =
    "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
    "#define MIN(a,b) ((a) < (b)) ? (a) : (b) \n"
    "#define MAX(a,b) ((a) > (b)) ? (a) : (b) \n"
    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
    "void local_atomics_histogram(__global uint4 *Image,\n"
    "__global uint *Histogram,\n"
    "uint n4VectorsPerThread)\n"
    "{\n"
    "    __local __attribute__((aligned(16))) uint subhists[NBANKS * NBINS];\n"
    "\n"
    "    uint tid = get_global_id(0);\n"
    "    uint ltid = get_local_id(0);\n"
    "    uint Stride = get_global_size(0);\n"
    "\n"
    "    uint i, idx;\n"
    "    uint4 temp, temp2;\n"
    "    const uint shft = (uint) BITS_PER_PIX;\n"
    "    const uint msk = (uint) (NBINS-1);\n"
    "    uint offset = (uint) ltid % (uint) (NBANKS);\n"
    "\n"
    "    uint lmem_items = NBANKS * NBINS;\n"
    "    uint lmem_items_per_thread;\n"
    "    uint lmem_max_threads;\n"
    "\n"
    "    // parallel LDS clear\n"
    "    // first, calculate threads per item, at least 1:\n"
    "    lmem_max_threads = MIN( 1, get_local_size(0) / lmem_items );\n"
    "    // but no more than we have items:\n"
    "    lmem_max_threads = MAX( 1, lmem_max_threads / lmem_items );\n"
    "    // calculate threads total:\n"
    "    lmem_max_threads = lmem_items / lmem_max_threads;\n"
    "    // but no more than LDS banks:\n"
    "    lmem_max_threads = MIN( get_local_size(0), lmem_max_threads );\n"
    "\n"
    "    lmem_items_per_thread = lmem_items / lmem_max_threads;\n"
    "\n"
    "    // now, clear LDS\n"
    "    __local uint4 *p = (__local uint4 *) subhists;\n"
    "\n"
    "    if( ltid < lmem_max_threads )\n"
    "    {\n"
    "        for(i=0, idx=ltid; i<lmem_items_per_thread/4; i++, "
    "idx+=lmem_max_threads)\n"
    "        {\n"
    "            p[idx] = 0;\n"
    "        }\n"
    "    }\n"
    "\n"
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    "    // read & scatter phase\n"
    "\n"
    "    for( i=0, idx=tid; i<n4VectorsPerThread; i++, idx += Stride )\n"
    "    {\n"
    "        temp = Image[idx];\n"
    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
    "\n"
    "        (void) atom_inc( subhists + temp2.x );\n"
    "        (void) atom_inc( subhists + temp2.y );\n"
    "        (void) atom_inc( subhists + temp2.z );\n"
    "        (void) atom_inc( subhists + temp2.w );\n"
    "\n"
    "        temp = temp >> shft;\n"
    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
    "\n"
    "        (void) atom_inc( subhists + temp2.x );\n"
    "        (void) atom_inc( subhists + temp2.y );\n"
    "        (void) atom_inc( subhists + temp2.z );\n"
    "        (void) atom_inc( subhists + temp2.w );\n"
    "\n"
    "        temp = temp >> shft;\n"
    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
    "\n"
    "        (void) atom_inc( subhists + temp2.x );\n"
    "        (void) atom_inc( subhists + temp2.y );\n"
    "        (void) atom_inc( subhists + temp2.z );\n"
    "        (void) atom_inc( subhists + temp2.w );\n"
    "\n"
    "        temp = temp >> shft;\n"
    "        temp2 = (temp & msk) * (uint4) NBANKS + offset;\n"
    "\n"
    "        (void) atom_inc( subhists + temp2.x );\n"
    "        (void) atom_inc( subhists + temp2.y );\n"
    "        (void) atom_inc( subhists + temp2.z );\n"
    "        (void) atom_inc( subhists + temp2.w );\n"
    "    }\n"
    "\n"
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    "    // reduce __local banks to single histogram per work-group\n"
    "\n"
    "    if( ltid < NBINS )\n"
    "    {\n"
    "        uint bin = 0;\n"
    "        for( i=0; i<NBANKS; i++ )\n"
    "        {\n"
    "            bin += subhists[ (ltid * NBANKS) + i ];\n"
    "        }\n"
    "        Histogram[ (get_group_id(0) * NBINS) + ltid ] = bin;\n"
    "    }\n"
    "}\n";
// OpenCL C source of the histogram merge kernel. Work-item `tid` adds up
// entry `tid` of nSubHists consecutive histograms of NBINS entries each and
// writes the total back to Histogram[tid], i.e. into the first histogram.
// Expects NBINS to be defined when the program is built.
static const char *local_atomics_reduce =
    " __kernel void local_atomics_reduce( __global uint *Histogram, uint "
    "nSubHists )\n"
    "{\n"
    "    uint tid = get_global_id(0);\n"
    "    uint bin = 0;\n"
    "    // Reduce work-group histograms into single histogram,\n"
    "    // one thread for each bin.\n"
    "    for( int i=0; i < nSubHists; i++ )\n"
    "        bin += Histogram[ (i * NBINS) + tid ];\n"
    "    Histogram[ tid ] = bin;\n"
    "}\n";
// OpenCL C source of the global-memory atomics histogram kernel (scalar
// loads). Each work-item reads ItemsPerThread uints, starting at its global
// id and advancing by the global size, and for each of the four bytes of
// every uint atom_inc's the matching entry of Histogram in global memory.
// Expects NBINS and BITS_PER_PIX to be defined when the program is built and
// requires a work-group size of 256.
static const char *global_atomics_histogram =
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
    "void global_atomics_histogram(uint ItemsPerThread,\n"
    "__global uint *Input,\n"
    "__global uint *Histogram)\n"
    "{\n"
    "    uint tid = get_global_id(0);\n"
    "    const uint shft = (uint) BITS_PER_PIX;\n"
    "    const uint msk = (uint) (NBINS-1);\n"
    "    uint Stride = get_global_size(0);\n"
    "    for( int i = 0; i < ItemsPerThread; i++)\n"
    "    {\n"
    "        uint temp = Input[tid];\n"
    "        atom_inc( &(Histogram[ (temp & msk) ]) );\n"
    "        temp = temp >> shft;\n"
    "        atom_inc( &(Histogram[ (temp & msk) ]) );\n"
    "        temp = temp >> shft;\n"
    "        atom_inc( &(Histogram[ (temp & msk) ]) );\n"
    "        temp = temp >> shft;\n"
    "        atom_inc( &(Histogram[ (temp & msk) ]) );\n"
    "        tid += Stride;"
    "    }\n"
    "}\n";
// OpenCL C source of the global-memory atomics histogram kernel using uint4
// loads. Same scheme as the scalar variant, but every load brings in four
// uints and each of the four byte passes increments one Histogram entry per
// vector component.
// Note that the kernel function inside this source is also named
// global_atomics_histogram, the same as in the scalar source.
// Expects NBINS and BITS_PER_PIX to be defined when the program is built and
// requires a work-group size of 256.
static const char *global_vec4_atomics_histogram =
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    "__kernel __attribute__((reqd_work_group_size(256,1,1)))\n"
    "void global_atomics_histogram(uint ItemsPerThread,\n"
    "__global uint4 *Input,\n"
    "__global uint *Histogram)\n"
    "{\n"
    "    uint tid = get_global_id(0);\n"
    "    const uint shft = (uint) BITS_PER_PIX;\n"
    "    const uint msk = (uint) (NBINS-1);\n"
    "    uint Stride = get_global_size(0);\n"
    "    for( int i = 0; i < ItemsPerThread; i++)\n"
    "    {\n"
    "        uint4 temp = Input[tid];\n"
    "        atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
    "        temp = temp >> shft;\n"
    "        atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
    "        temp = temp >> shft;\n"
    "        atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
    "        temp = temp >> shft;\n"
    "        atom_inc( &(Histogram[ (temp.x & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.y & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.z & msk) ]) );\n"
    "        atom_inc( &(Histogram[ (temp.w & msk) ]) );\n"
    "        tid += Stride;"
    "    }\n"
    "}\n";
// OpenCL C source of the "all to zero" reduction kernel using the atom_add
// built-in. Each work-item reads ItemsPerThread uints, starting at its
// global id and advancing by the global size, adds up the low two bits of
// each of the four bytes of every uint, and then atom_add's its total to
// Output[0].
static const char *global_atomics_sum_reduction_all_to_zero =
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    " __kernel void global_atomics_sum_reduction_all_to_zero(uint "
    "ItemsPerThread, __global uint *Input, __global int *Output )\n"
    "{\n"
    "    uint sum = 0;\n"
    "    const uint msk = (uint)3;\n"
    "    const uint shft = (uint)8;\n"
    "    \n"
    "    uint tid = get_global_id(0);\n"
    "    uint Stride = get_global_size(0);\n"
    "    for( int i = 0; i < ItemsPerThread; i++)\n"
    "    {\n"
    "      uint data = Input[tid];\n"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      tid += Stride;\n"
    "    }\n"
    "    atom_add( &(Output[0]), sum);\n"
    "}\n";
// OpenCL C source of the per-work-group reduction kernel using the atom_add
// built-in. Each work-item reads ItemsPerThread uints, starting at its
// global id and advancing by the global size, adds up the low two bits of
// each of the four bytes of every uint, and then atom_add's its total to
// Output[get_group_id(0)], giving one partial sum per work-group.
static const char *global_atomics_sum_reduction_workgroup =
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    " __kernel void global_atomics_sum_reduction_workgroup(uint "
    "ItemsPerThread, __global uint *Input, __global int *Output )\n"
    "{\n"
    "    uint sum = 0;\n"
    "    const uint msk = (uint)3;\n"
    "    const uint shft = (uint)8;\n"
    "    \n"
    "    uint tid = get_global_id(0);\n"
    "    uint Stride = get_global_size(0);\n"
    "    for( int i = 0; i < ItemsPerThread; i++)\n"
    "    {\n"
    "      uint data = Input[tid];\n"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      data = data >> shft;"
    "      sum += data & msk;\n"
    "      tid += Stride;\n"
    "    }\n"
    "    atom_add( &(Output[get_group_id(0)]), sum);\n"
    "}\n";
// OpenCL C source of the local-memory reduction kernel without atomics.
// Each work-item loads the two uints at indices 2 * global_id and
// 2 * global_id + 1, adds up the low two bits of each of their four bytes
// and stores the result in sdata[local id]. The work-group then halves the
// active range repeatedly (barrier after every step), adding sdata[tid + s]
// into sdata[tid], and work-item 0 writes sdata[0] to
// output[get_group_id(0)].
// NOTE(review): the halving loop only covers every element when the
// work-group size is a power of two — confirm callers guarantee that.
static const char *local_reduction =
    "__kernel void local_reduction(__global uint* input, __global uint* "
    "output, __local uint* sdata)\n"
    "{\n"
    "    // load shared mem\n"
    "    const uint msk = (uint)3;\n"
    "    const uint shft = (uint)8;\n"
    "    unsigned int tid = get_local_id(0);\n"
    "\n"
    "    unsigned int localSize = get_local_size(0);\n"
    "    unsigned int stride = get_global_id(0) * 2;\n"
    "    unsigned int data1 = input[stride];\n"
    "    unsigned int data2 = input[stride + 1];\n"
    "    unsigned int sum = 0;\n"
    "    for( int i = 0; i < 4; i++)\n"
    "    {\n"
    "        sum += (data1 & msk) + (data2 & msk);\n"
    "        data1 = data1 >> shft;\n"
    "        data2 = data2 >> shft;\n"
    "    }\n"
    "    sdata[tid] = sum;"
    "\n"
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    // do reduction in shared mem\n"
    "    for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
    "    {\n"
    "        if(tid < s) \n"
    "        {\n"
    "            sdata[tid] += sdata[tid + s];\n"
    "        }\n"
    "        barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    }\n"
    "\n"
    "    // write result for this block to global mem\n"
    "    if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
    "}\n";
// OpenCL C source of the uint4 variant of the local-memory reduction.
//
// The kernel entry point inside the text is still named local_reduction; only
// the C++ variable name differs. Each work-item loads two uint4 values
// (input[2 * gid] and input[2 * gid + 1]), accumulates the low two bits of
// every byte of every component into a uint4 sum, stores it in sdata[tid],
// and the work-group then folds sdata in halves with a barrier after each
// step. Work-item 0 writes sdata[0] to output[get_group_id(0)].
static const char *local_vec4_reduction =
"__kernel void local_reduction(__global uint4* input, __global uint4* "
"output, __local uint4* sdata)\n"
"{\n"
" // load shared mem\n"
" const uint msk = (uint)3;\n"
" const uint shft = (uint)8;\n"
" unsigned int tid = get_local_id(0);\n"
"\n"
" unsigned int localSize = get_local_size(0);\n"
" unsigned int stride = get_global_id(0) * 2;\n"
" uint4 data1 = input[stride];\n"
" uint4 data2 = input[stride + 1];\n"
" uint4 sum = 0;\n"
" for( int i = 0; i < 4; i++)\n"
" {\n"
" sum += (data1 & msk) + (data2 & msk);\n"
" data1 = data1 >> shft;\n"
" data2 = data2 >> shft;\n"
" }\n"
" sdata[tid] = sum;"
"\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" // do reduction in shared mem\n"
" for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
" {\n"
" if(tid < s) \n"
" {\n"
" sdata[tid] += sdata[tid + s];\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"\n"
" // write result for this block to global mem\n"
" if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
"}\n";
// OpenCL C source of the local-memory reduction that uses local atomics.
//
// Same structure as the plain scalar reduction (the kernel entry point is also
// named local_reduction), except that each folding step adds sdata[tid + s]
// into sdata[tid] with atom_add instead of "+=". The text enables
// cl_khr_local_int32_base_atomics for that purpose.
static const char *local_atomics_reduction =
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"__kernel void local_reduction(__global uint* input, __global uint* "
"output, __local uint* sdata)\n"
"{\n"
" // load shared mem\n"
" const uint msk = (uint)3;\n"
" const uint shft = (uint)8;\n"
" unsigned int tid = get_local_id(0);\n"
"\n"
" unsigned int localSize = get_local_size(0);\n"
" unsigned int stride = get_global_id(0) * 2;\n"
" unsigned int data1 = input[stride];\n"
" unsigned int data2 = input[stride + 1];\n"
" unsigned int sum = 0;\n"
" for( int i = 0; i < 4; i++)\n"
" {\n"
" sum += (data1 & msk) + (data2 & msk);\n"
" data1 = data1 >> shft;\n"
" data2 = data2 >> shft;\n"
" }\n"
" sdata[tid] = sum;"
"\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" // do reduction in shared mem\n"
" for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
" {\n"
" if(tid < s) \n"
" {\n"
" atom_add( &(sdata[tid]), sdata[tid + s]);\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"\n"
" // write result for this block to global mem\n"
" if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
"}\n";
// OpenCL C source of the uint4 local-memory reduction that uses local atomics.
//
// Combines the uint4 data path with the atomic folding step: every folding
// iteration issues four atom_add calls, one per component (.x, .y, .z, .w) of
// sdata[tid]. The kernel entry point inside the text is named local_reduction.
//
// NOTE(review): the kernel takes the address of individual vector components
// (&(sdata[tid]).x); whether every OpenCL C compiler accepts that should be
// confirmed against the targeted implementations.
static const char *local_vec4_atomics_reduction =
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"__kernel void local_reduction(__global uint4* input, __global uint4* "
"output, __local uint4* sdata)\n"
"{\n"
" // load shared mem\n"
" const uint msk = (uint)3;\n"
" const uint shft = (uint)8;\n"
" unsigned int tid = get_local_id(0);\n"
"\n"
" unsigned int localSize = get_local_size(0);\n"
" unsigned int stride = get_global_id(0) * 2;\n"
" uint4 data1 = input[stride];\n"
" uint4 data2 = input[stride + 1];\n"
" uint4 sum = 0;\n"
" for( int i = 0; i < 4; i++)\n"
" {\n"
" sum += (data1 & msk) + (data2 & msk);\n"
" data1 = data1 >> shft;\n"
" data2 = data2 >> shft;\n"
" }\n"
" sdata[tid] = sum;"
"\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" // do reduction in shared mem\n"
" for(unsigned int s = localSize >> 1; s > 0; s >>= 1)\n"
" {\n"
" if(tid < s) \n"
" {\n"
" atom_add( &(sdata[tid]).x, sdata[tid + s].x);\n"
" atom_add( &(sdata[tid]).y, sdata[tid + s].y);\n"
" atom_add( &(sdata[tid]).z, sdata[tid + s].z);\n"
" atom_add( &(sdata[tid]).w, sdata[tid + s].w);\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"\n"
" // write result for this block to global mem\n"
" if(tid == 0) output[get_group_id(0)] = sdata[0];\n"
"}\n";
@@ -0,0 +1,254 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfBufferCopyOverhead.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <complex>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// One copy-overhead configuration.
struct testStruct {
  // Number of timed clEnqueueCopyBuffer calls.
  unsigned int iterations;
  // Flush (or finish) the queue after every `flushEvery` enqueues; a value
  // that is not positive disables the periodic flush.
  int flushEvery;
};

// Configurations grouped by iteration count; within a group the flush
// interval grows and the final entry (-1) never flushes inside the loop.
static testStruct testList[] = {
    // 1 iteration
    {1, -1},
    {1, -1},
    // 10 iterations
    {10, 1},
    {10, -1},
    // 100 iterations
    {100, 1},
    {100, 10},
    {100, -1},
    // 1000 iterations
    {1000, 1},
    {1000, 10},
    {1000, 100},
    {1000, -1},
    // 10000 iterations
    {10000, 1},
    {10000, 10},
    {10000, 100},
    {10000, 1000},
    {10000, -1},
    // 100000 iterations
    {100000, 1},
    {100000, 10},
    {100000, 100},
    {100000, 1000},
    {100000, 10000},
    {100000, -1},
};
// Registers the sub-test count: every testList entry is run in four variants
// (two wait modes times two copy directions, as decoded in open()).
OCLPerfBufferCopyOverhead::OCLPerfBufferCopyOverhead() {
  const size_t numConfigs = sizeof(testList) / sizeof(testList[0]);
  _numSubTests = 4 * numConfigs;
}
// Empty destructor: the OpenCL objects held by this test are released in
// close().
OCLPerfBufferCopyOverhead::~OCLPerfBufferCopyOverhead() {}
// Context notification callback passed to clCreateContext. It deliberately
// does nothing with the reported error information.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/, void * /*user_data*/) {
}
// Prepares sub-test `test` on device `deviceId`.
//
// The sub-test index is decoded as follows (N = number of testList entries):
//   test % N        -> testList entry (iteration count / flush interval)
//   (test / N) % 2  -> wait mode: 0 = spin on the event, 1 = clFinish ("sleep")
//   test >= 2 * N   -> source buffer is CL_MEM_ALLOC_HOST_PTR, otherwise the
//                      destination buffer is
//
// Creates the context, an in-order command queue and two 4-byte buffers.
// `units` is not used; `conversion` is set to 1.0.
//
// CHECK_RESULT is expected to record the failure and leave the function, so
// temporary host allocations are released before each check that follows them.
void OCLPerfBufferCopyOverhead::open(unsigned int test, char *units,
                                     double &conversion,
                                     unsigned int deviceId) {
  cl_uint numPlatforms;
  cl_platform_id platform = NULL;
  cl_uint num_devices = 0;
  cl_device_id *devices = NULL;
  cl_device_id device = NULL;

  _crcword = 0;
  conversion = 1.0f;
  _deviceId = deviceId;
  _openTest = test % (sizeof(testList) / sizeof(testStruct));
  context_ = 0;
  cmd_queue_ = 0;
  srcBuffer_ = 0;
  dstBuffer_ = 0;

  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
  if (0 < numPlatforms) {
    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
    if (error_ == CL_SUCCESS) {
      platform = platforms[_platformIndex];
    }
    // Only the selected handle is needed from here on. Release the array
    // (allocated with new[], hence delete[]) before the check can bail out.
    delete[] platforms;
    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");

    char pbuf[100];
    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
                                         sizeof(pbuf), pbuf, NULL);
    num_devices = 0;
    /* Get the number of requested devices */
    error_ =
        _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
    // Runtime returns an error when no GPU devices are present instead of just
    // returning 0 devices, so the result is intentionally not checked here.
  }

  bufSize_ = 4;

  /*
   * If we could find our platform, use it. If not, die as we need the AMD
   * platform for these extensions.
   */
  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");

  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
  CHECK_RESULT(devices == 0, "no devices");

  /* Get the requested device */
  error_ =
      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
  // Copy the one handle we need and free the list before any check can
  // return; the list was previously never freed.
  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
    device = devices[_deviceId];
  }
  free(devices);
  devices = NULL;
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");

  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
                                       &error_);
  CHECK_RESULT(context_ == 0, "clCreateContext failed");

  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");

  cl_mem_flags flags = CL_MEM_READ_ONLY;
  sleep = ((test / (sizeof(testList) / sizeof(testStruct))) % 2) > 0;
  if (test >= ((sizeof(testList) / sizeof(testStruct)) * 2)) {
    srcHost = true;
    flags |= CL_MEM_ALLOC_HOST_PTR;
  } else {
    srcHost = false;
  }
  srcBuffer_ =
      _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_);
  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");

  // Exactly one side of the copy lives in host-allocated memory.
  flags = CL_MEM_WRITE_ONLY;
  if (!srcHost) {
    flags |= CL_MEM_ALLOC_HOST_PTR;
  }
  dstBuffer_ =
      _wrapper->clCreateBuffer(context_, flags, bufSize_, NULL, &error_);
  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");
}
void OCLPerfBufferCopyOverhead::run(void) {
CPerfCounter timer;
cl_event event;
cl_int eventStatus;
unsigned int iter = testList[_openTest].iterations;
// Warm up
error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0,
0, bufSize_, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
error_ = _wrapper->clFinish(cmd_queue_);
CHECK_RESULT(error_, "clFinish failed");
timer.Reset();
timer.Start();
for (unsigned int i = 0; i < iter; i++) {
error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_,
0, 0, bufSize_, 0, NULL, &event);
CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
if ((testList[_openTest].flushEvery > 0) &&
(((i + 1) % testList[_openTest].flushEvery) == 0)) {
if (sleep) {
_wrapper->clFinish(cmd_queue_);
} else {
_wrapper->clFlush(cmd_queue_);
error_ =
_wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
while (eventStatus > 0) {
error_ =
_wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
}
}
}
if (i != (iter - 1)) {
_wrapper->clReleaseEvent(event);
}
}
if (sleep) {
_wrapper->clFinish(cmd_queue_);
} else {
_wrapper->clFlush(cmd_queue_);
error_ = _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
while (eventStatus > 0) {
error_ =
_wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
}
}
_wrapper->clReleaseEvent(event);
timer.Stop();
double sec = timer.GetElapsedTime();
// Buffer copy time in us
double perf = sec * 1000. * 1000. / iter;
const char *strSrc = NULL;
const char *strDst = NULL;
const char *strWait = NULL;
if (srcHost) {
strSrc = "host";
strDst = "dev";
} else {
strSrc = "dev";
strDst = "host";
}
if (sleep) {
strWait = "sleep";
} else {
strWait = "spin";
}
_perfInfo = (float)perf;
char buf[256];
SNPRINTF(buf, sizeof(buf), " %5s, s:%4s d:%4s i:%6d (us) ", strWait, strSrc,
strDst, iter);
testDescString = buf;
}
// Releases whatever open() managed to create, in the order source buffer,
// destination buffer, command queue, context. A failed release is reported
// through CHECK_RESULT_NO_RETURN and the remaining objects are still
// processed. Returns the accumulated _crcword.
unsigned int OCLPerfBufferCopyOverhead::close(void) {
  if (srcBuffer_ != 0) {
    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(srcBuffer_) failed");
  }

  if (dstBuffer_ != 0) {
    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(dstBuffer_) failed");
  }

  if (cmd_queue_ != 0) {
    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseCommandQueue failed");
  }

  if (context_ != 0) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
  }

  return _crcword;
}
@@ -0,0 +1,50 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_BufferCopyOverhead_H_
#define _OCL_BufferCopyOverhead_H_
#include "OCLTestImp.h"
// Performance test that measures the per-call overhead of very small
// clEnqueueCopyBuffer operations for different flush intervals and wait modes.
class OCLPerfBufferCopyOverhead : public OCLTestImp {
 public:
  OCLPerfBufferCopyOverhead();
  virtual ~OCLPerfBufferCopyOverhead();

  // Test life cycle: open() creates the OpenCL objects, run() performs the
  // timed copies, close() releases everything again.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run();
  virtual unsigned int close();

  static const unsigned int NUM_ITER = 1000;

  // OpenCL objects owned between open() and close().
  cl_context context_;
  cl_command_queue cmd_queue_;
  cl_mem srcBuffer_;
  cl_mem dstBuffer_;
  // Result of the most recent OpenCL call.
  cl_int error_;
  // Size in bytes of both buffers.
  unsigned int bufSize_;
  // true: drain the queue with clFinish; false: poll the event status.
  bool sleep;
  // true: the source buffer is host-allocated; false: the destination is.
  bool srcHost;
};
#endif // _OCL_BufferCopyOverhead_H_
@@ -0,0 +1,439 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfBufferCopySpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <complex>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
#define NUM_SIZES 8
// Buffer sizes in bytes: 4 KB, 8 KB, 64 KB, 256 KB, 1 MB, 4 MB, 16 MB and
// 16 MB plus 10 bytes.
static const unsigned int Sizes[NUM_SIZES] = {
    4 * 1024,
    8 * 1024,
    64 * 1024,
    256 * 1024,
    1 * 1024 * 1024,
    4 * 1024 * 1024,
    16 * 1024 * 1024,
    16 * 1024 * 1024 + 10,
};
// Timed copies per sub-test: a single copy first, then NUM_ITER copies.
static const unsigned int Iterations[2] = {1, OCLPerfBufferCopySpeed::NUM_ITER};
#define BUF_TYPES 4
// Every source buffer type is paired with every destination buffer type.
#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES)
// Registers the sub-test count: every buffer size is combined with every
// source/destination buffer type pair, and the whole set is run once per
// entry of Iterations[].
OCLPerfBufferCopySpeed::OCLPerfBufferCopySpeed() {
  const unsigned int combosPerIterationCount = NUM_SIZES * NUM_SUBTESTS;
  _numSubTests = 2 * combosPerIterationCount;
}
// Empty destructor: OpenCL objects and host allocations are released in
// close().
OCLPerfBufferCopySpeed::~OCLPerfBufferCopySpeed() {}
// Context notification callback passed to clCreateContext. It deliberately
// does nothing with the reported error information.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/, void * /*user_data*/) {
}
// Fills the first size / 4 32-bit words behind `ptr` with the ramp
// 0, 1, 2, ... Trailing bytes beyond the last whole word are left untouched.
// The `value` argument is ignored; the pattern always starts at zero.
void OCLPerfBufferCopySpeed::setData(void *ptr, unsigned int size,
                                     unsigned int value) {
  unsigned int *words = static_cast<unsigned int *>(ptr);
  const unsigned int wordCount = size / 4;
  for (unsigned int idx = 0; idx < wordCount; ++idx) {
    words[idx] = idx;
  }
}
// Verifies that the first size / 4 32-bit words behind `ptr` hold the ramp
// 0, 1, 2, ... written by setData(). On the first mismatch it prints up to
// four received words next to the words that were expected, flags the failure
// through CHECK_RESULT and stops checking.
//
// The `value` argument is ignored; the expected pattern always starts at zero.
void OCLPerfBufferCopySpeed::checkData(void *ptr, unsigned int size,
                                       unsigned int value) {
  (void)value;
  const unsigned int *words = static_cast<const unsigned int *>(ptr);
  const unsigned int wordCount = size >> 2;

  for (unsigned int i = 0; i < wordCount; i++) {
    if (words[i] == i) {
      continue;
    }

    // Never read past the end of the buffer when the mismatch is within the
    // last three words.
    const unsigned int remaining = wordCount - i;
    const unsigned int shown = (remaining < 4) ? remaining : 4;

    printf("Data validation failed at %u! Got", i);
    for (unsigned int k = 0; k < shown; k++) {
      printf(" 0x%08x", words[i + k]);
    }
    printf("\n");

    // The reference data is a ramp, so the expected words are i, i + 1, ...
    printf("Expected");
    for (unsigned int k = 0; k < shown; k++) {
      printf(" 0x%08x", i + k);
    }
    printf("\n");

    CHECK_RESULT(true, "Data validation failed!");
    break;
  }
}
// Prepares sub-test `test` on device `deviceId`.
//
// The sub-test index is decoded as:
//   test % NUM_SIZES                        -> buffer size (Sizes[])
//   (test / NUM_SIZES) % BUF_TYPES          -> source buffer type
//       0 = default, 1 = ALLOC_HOST_PTR, 2 = persistent (AMD only),
//       3 = USE_HOST_PTR
//   (test / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES -> destination buffer type
//       0 = default, 1 = persistent (AMD only), 2 = ALLOC_HOST_PTR,
//       3 = USE_HOST_PTR
//   test / (NUM_SIZES * NUM_SUBTESTS)       -> iteration count (Iterations[])
//
// Creates the context, an in-order command queue and both buffers, and fills
// the source buffer with the reference pattern. `units` is not used;
// `conversion` is set to 1.0.
//
// CHECK_RESULT is expected to record the failure and leave the function, so
// temporary host allocations are released before each check that follows them.
void OCLPerfBufferCopySpeed::open(unsigned int test, char *units,
                                  double &conversion, unsigned int deviceId) {
  cl_uint numPlatforms;
  cl_platform_id platform = NULL;
  cl_uint num_devices = 0;
  cl_device_id *devices = NULL;
  cl_device_id device = NULL;

  _crcword = 0;
  conversion = 1.0f;
  _deviceId = deviceId;
  _openTest = test;

  context_ = 0;
  cmd_queue_ = 0;
  srcBuffer_ = 0;
  dstBuffer_ = 0;
  for (int side = 0; side < 2; side++) {
    persistent[side] = false;
    allocHostPtr[side] = false;
    useHostPtr[side] = false;
    memptr[side] = NULL;
    alignedmemptr[side] = NULL;
  }
  isAMD = false;

  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
  if (0 < numPlatforms) {
    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
    if (error_ == CL_SUCCESS) {
      platform = platforms[_platformIndex];
    }
    // Only the selected handle is needed from here on. Release the array
    // (allocated with new[], hence delete[]) before the check can bail out.
    delete[] platforms;
    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");

    char pbuf[100];
    // Keep the vendor string well defined even if the query below fails.
    pbuf[0] = '\0';
    error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
                                         sizeof(pbuf), pbuf, NULL);
    num_devices = 0;
    /* Get the number of requested devices */
    error_ =
        _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
    // Runtime returns an error when no GPU devices are present instead of just
    // returning 0 devices, so the result is intentionally not checked here.
    if ((num_devices > 0) && !strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
      isAMD = true;
    }
  }

  /*
   * If we could find our platform, use it. If not, die as we need the AMD
   * platform for these extensions. Checked before the handle is first used.
   */
  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");

  char getVersion[128];
  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
                                       sizeof(getVersion), getVersion, NULL);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
  // Characters 7..9 of the version string hold "<major>.<minor>"
  // (the string starts with "OpenCL ").
  platformVersion[0] = getVersion[7];
  platformVersion[1] = getVersion[8];
  platformVersion[2] = getVersion[9];
  platformVersion[3] = '\0';

  bufSize_ = Sizes[_openTest % NUM_SIZES];
  const unsigned int srcTest = (_openTest / NUM_SIZES) % BUF_TYPES;
  const unsigned int dstTest =
      (_openTest / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES;

  if (srcTest == 3) {
    useHostPtr[0] = true;
  } else if ((srcTest == 2) && isAMD) {
    persistent[0] = true;
  } else if (srcTest == 1) {
    allocHostPtr[0] = true;
  }

  if ((dstTest == 1) && isAMD) {
    persistent[1] = true;
  } else if (dstTest == 2) {
    allocHostPtr[1] = true;
  } else if (dstTest == 3) {
    useHostPtr[1] = true;
  }

  numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];

  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
  CHECK_RESULT(devices == 0, "no devices");

  /* Get the requested device */
  error_ =
      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
  // Copy the one handle we need and free the list before any check can
  // return; the list was previously never freed.
  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
    device = devices[_deviceId];
  }
  free(devices);
  devices = NULL;
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");

  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
                                       &error_);
  CHECK_RESULT(context_ == 0, "clCreateContext failed");

  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");

  // Source buffer
  cl_mem_flags flags = CL_MEM_READ_ONLY;
  if (persistent[0]) {
    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
  } else if (allocHostPtr[0]) {
    flags |= CL_MEM_ALLOC_HOST_PTR;
  } else if (useHostPtr[0]) {
    flags |= CL_MEM_USE_HOST_PTR;
    // Over-allocate so that the pointer can be rounded up to 4 KB; memptr
    // keeps the original pointer for free() in close().
    memptr[0] = malloc(bufSize_ + 4096);
    CHECK_RESULT(memptr[0] == 0, "malloc(memptr[0]) failed");
    alignedmemptr[0] = (void *)(((size_t)memptr[0] + 4095) & ~(size_t)4095);
  }
  srcBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_,
                                        alignedmemptr[0], &error_);
  CHECK_RESULT(srcBuffer_ == 0, "clCreateBuffer(srcBuffer) failed");

  // Write the reference pattern into the source buffer.
  void *mem;
  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, srcBuffer_, CL_TRUE,
                                     CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
                                     &error_);
  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
  setData(mem, bufSize_, 0x600df00d);
  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, srcBuffer_, mem, 0, NULL, NULL);

  // Destination buffer
  flags = CL_MEM_WRITE_ONLY;
  if (persistent[1]) {
    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
  } else if (allocHostPtr[1]) {
    flags |= CL_MEM_ALLOC_HOST_PTR;
  } else if (useHostPtr[1]) {
    flags |= CL_MEM_USE_HOST_PTR;
    memptr[1] = malloc(bufSize_ + 4096);
    CHECK_RESULT(memptr[1] == 0, "malloc(memptr[1]) failed");
    alignedmemptr[1] = (void *)(((size_t)memptr[1] + 4095) & ~(size_t)4095);
  }
  dstBuffer_ = _wrapper->clCreateBuffer(context_, flags, bufSize_,
                                        alignedmemptr[1], &error_);
  CHECK_RESULT(dstBuffer_ == 0, "clCreateBuffer(dstBuffer) failed");

  // Force persistent memory to be on GPU
  // NOTE(review): with a persistent *source* the warm-up copy below touches
  // dstBuffer_, and with a persistent *destination* it touches srcBuffer_.
  // The two cases look swapped; behaviour is kept as-is pending confirmation.
  if (persistent[0]) {
    cl_mem memBuffer =
        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, dstBuffer_, 0, 0,
                                  bufSize_, 0, NULL, NULL);
    _wrapper->clFinish(cmd_queue_);
    _wrapper->clReleaseMemObject(memBuffer);
  }
  if (persistent[1]) {
    cl_mem memBuffer =
        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
    _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, memBuffer, 0, 0,
                                  bufSize_, 0, NULL, NULL);
    _wrapper->clFinish(cmd_queue_);
    _wrapper->clReleaseMemObject(memBuffer);
  }
}
// Times numIter clEnqueueCopyBuffer calls of bufSize_ bytes (after one untimed
// warm-up copy), validates the destination contents and reports the bandwidth
// in GB/s through _perfInfo.
void OCLPerfBufferCopySpeed::run(void) {
  CPerfCounter timer;

  // Warm up
  error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_, 0,
                                         0, bufSize_, 0, NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
  error_ = _wrapper->clFinish(cmd_queue_);
  CHECK_RESULT(error_, "clFinish failed");

  // Timed section: enqueue everything, then wait once.
  timer.Reset();
  timer.Start();
  for (unsigned int pass = 0; pass < numIter; ++pass) {
    error_ = _wrapper->clEnqueueCopyBuffer(cmd_queue_, srcBuffer_, dstBuffer_,
                                           0, 0, bufSize_, 0, NULL, NULL);
    CHECK_RESULT(error_, "clEnqueueCopyBuffer failed");
  }
  error_ = _wrapper->clFinish(cmd_queue_);
  CHECK_RESULT(error_, "clFinish failed");
  timer.Stop();

  const double sec = timer.GetElapsedTime();
  // Buffer copy bandwidth in GB/s
  const double gigabytes = (double)bufSize_ * numIter * (double)(1e-09);
  double perf = gigabytes / sec;

  // Read the destination back and compare it with the reference pattern.
  void *mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, dstBuffer_, CL_TRUE,
                                           CL_MAP_READ, 0, bufSize_, 0, NULL,
                                           NULL, &error_);
  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
  checkData(mem, bufSize_, 0x600df00d);
  _wrapper->clEnqueueUnmapMemObject(cmd_queue_, dstBuffer_, mem, 0, NULL, NULL);

  // Short labels for the description string.
  const char *strSrc = persistent[0]     ? "per"
                       : allocHostPtr[0] ? "AHP"
                       : useHostPtr[0]   ? "UHP"
                                         : "dev";
  const char *strDst = persistent[1]     ? "per"
                       : allocHostPtr[1] ? "AHP"
                       : useHostPtr[1]   ? "UHP"
                                         : "dev";

  const bool srcInSysmem = allocHostPtr[0] || useHostPtr[0];
  const bool dstInSysmem = allocHostPtr[1] || useHostPtr[1];
  const bool srcOnDevice = persistent[0] || !srcInSysmem;
  const bool dstOnDevice = persistent[1] || !dstInSysmem;

  // Double results when src and dst are both on device
  if (srcOnDevice && dstOnDevice) {
    perf *= 2.0;
  }
  // Double results when src and dst are both in sysmem
  if (srcInSysmem && dstInSysmem) {
    perf *= 2.0;
  }

  _perfInfo = (float)perf;
  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_,
           strSrc, strDst, numIter);
  testDescString = buf;
}
// Releases whatever open() managed to create: both buffers first, then the
// host allocations backing USE_HOST_PTR buffers, then the command queue and
// the context. A failed release is reported through CHECK_RESULT_NO_RETURN
// and the remaining objects are still processed. Returns the accumulated
// _crcword.
unsigned int OCLPerfBufferCopySpeed::close(void) {
  if (srcBuffer_ != 0) {
    error_ = _wrapper->clReleaseMemObject(srcBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(srcBuffer_) failed");
  }

  if (dstBuffer_ != 0) {
    error_ = _wrapper->clReleaseMemObject(dstBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(dstBuffer_) failed");
  }

  // Host memory is only freed once the buffers that wrap it are gone.
  for (int side = 0; side < 2; ++side) {
    if (memptr[side] != NULL) {
      free(memptr[side]);
    }
  }

  if (cmd_queue_ != 0) {
    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseCommandQueue failed");
  }

  if (context_ != 0) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
  }

  return _crcword;
}
// Rectangular-copy variant of the bandwidth test: times
// clEnqueueCopyBufferRect over a square width x width byte region, where
// width is the truncated square root of bufSize_, and reports GB/s through
// _perfInfo. Platforms reporting version 1.0 are skipped.
//
// NOTE(review): the reported bandwidth is computed from bufSize_, while the
// region copied is width * width bytes, which can be smaller.
void OCLPerfBufferCopyRectSpeed::run(void) {
  CPerfCounter timer;

  // Skip for 1.0 platforms
  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
    char skipped[256];
    SNPRINTF(skipped, sizeof(skipped), " SKIPPED ");
    testDescString = skipped;
    return;
  }

  const size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
  size_t srcOrigin[3] = {0, 0, 0};
  size_t dstOrigin[3] = {0, 0, 0};
  size_t region[3] = {width, width, 1};

  // Clamp iteration count for non-local writes to shorten test runtime
  unsigned int testNumIter = numIter;
  if (allocHostPtr[1] && (numIter >= 100)) {
    testNumIter = 100;
  }

  // Warm up
  error_ = _wrapper->clEnqueueCopyBufferRect(cmd_queue_, srcBuffer_, dstBuffer_,
                                             srcOrigin, dstOrigin, region,
                                             width, 0, width, 0, 0, NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed");
  error_ = _wrapper->clFinish(cmd_queue_);
  CHECK_RESULT(error_, "clFinish failed");

  // Timed section: enqueue everything, then wait once.
  timer.Reset();
  timer.Start();
  for (unsigned int pass = 0; pass < testNumIter; ++pass) {
    error_ = _wrapper->clEnqueueCopyBufferRect(
        cmd_queue_, srcBuffer_, dstBuffer_, srcOrigin, dstOrigin, region, width,
        0, width, 0, 0, NULL, NULL);
    CHECK_RESULT(error_, "clEnqueueCopyBufferRect failed");
  }
  error_ = _wrapper->clFinish(cmd_queue_);
  CHECK_RESULT(error_, "clFinish failed");
  timer.Stop();

  const double sec = timer.GetElapsedTime();
  // Buffer copy bandwidth in GB/s
  const double gigabytes = (double)bufSize_ * testNumIter * (double)(1e-09);
  double perf = gigabytes / sec;

  // Short labels for the description string.
  const char *strSrc = persistent[0]     ? "per"
                       : allocHostPtr[0] ? "AHP"
                       : useHostPtr[0]   ? "UHP"
                                         : "dev";
  const char *strDst = persistent[1]     ? "per"
                       : allocHostPtr[1] ? "AHP"
                       : useHostPtr[1]   ? "UHP"
                                         : "dev";

  const bool srcInSysmem = allocHostPtr[0] || useHostPtr[0];
  const bool dstInSysmem = allocHostPtr[1] || useHostPtr[1];
  const bool srcOnDevice = persistent[0] || !srcInSysmem;
  const bool dstOnDevice = persistent[1] || !dstInSysmem;

  // Double results when src and dst are both on device
  if (srcOnDevice && dstOnDevice) {
    perf *= 2.0;
  }
  // Double results when src and dst are both in sysmem
  if (srcInSysmem && dstInSysmem) {
    perf *= 2.0;
  }

  _perfInfo = (float)perf;
  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) s:%s d:%s i:%4d (GB/s) ", bufSize_,
           strSrc, strDst, testNumIter);
  testDescString = buf;
}
@@ -0,0 +1,65 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_BufferCopySpeed_H_
#define _OCL_BufferCopySpeed_H_
#include "OCLTestImp.h"
// Performance test that measures clEnqueueCopyBuffer bandwidth for a range of
// buffer sizes and source/destination buffer types.
class OCLPerfBufferCopySpeed : public OCLTestImp {
 public:
  OCLPerfBufferCopySpeed();
  virtual ~OCLPerfBufferCopySpeed();

  // Test life cycle: open() creates the OpenCL objects, run() performs the
  // timed copies, close() releases everything again.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run();
  virtual unsigned int close();

  static const unsigned int NUM_ITER = 1000;

  // OpenCL objects owned between open() and close().
  cl_context context_;
  cl_command_queue cmd_queue_;
  cl_mem srcBuffer_;
  cl_mem dstBuffer_;
  // Result of the most recent OpenCL call.
  cl_int error_;
  // Size in bytes of both buffers.
  unsigned int bufSize_;
  // Buffer type flags; index 0 describes the source, index 1 the destination.
  bool persistent[2];
  bool allocHostPtr[2];
  bool useHostPtr[2];
  // Number of timed copies for the current sub-test.
  unsigned int numIter;
  // Set when the selected platform's vendor string is AMD's.
  bool isAMD;
  // "<major>.<minor>" extracted from the platform version string.
  char platformVersion[32];

  // Write / verify the reference pattern in a mapped buffer.
  void setData(void* ptr, unsigned int size, unsigned int value);
  void checkData(void* ptr, unsigned int size, unsigned int value);

  // Host allocations backing USE_HOST_PTR buffers: the pointer returned by
  // malloc and the 4 KB aligned pointer handed to clCreateBuffer.
  void* memptr[2];
  void* alignedmemptr[2];
};
// Variant of OCLPerfBufferCopySpeed that reuses its setup and teardown but
// overrides run() to time rectangular buffer copies.
class OCLPerfBufferCopyRectSpeed : public OCLPerfBufferCopySpeed {
 public:
  OCLPerfBufferCopyRectSpeed() : OCLPerfBufferCopySpeed() {}

  virtual void run();
};
#endif // _OCL_BufferCopySpeed_H_
@@ -0,0 +1,334 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfBufferReadSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <complex>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
#define NUM_SIZES 8
// Buffer sizes in bytes: 1 KB, 32 KB, 64 KB, 128 KB, 256 KB, 1 MB, 4 MB and
// 16 MB.
static const unsigned int Sizes[NUM_SIZES] = {
    1 * 1024,
    32 * 1024,
    64 * 1024,
    128 * 1024,
    256 * 1024,
    1 * 1024 * 1024,
    4 * 1024 * 1024,
    16 * 1024 * 1024,
};
// Number of leading sub-tests that take their iteration count from
// Iterations[]; assigned in the constructor.
static cl_uint blockedSubtests;
// Timed reads per sub-test: a single read first, then NUM_ITER reads.
static const unsigned int Iterations[2] = {1, OCLPerfBufferReadSpeed::NUM_ITER};
#define NUM_OFFSETS 1
// Extra byte offsets applied to the aligned USE_HOST_PTR allocation.
static const unsigned int offsets[NUM_OFFSETS] = {0};
// Three fixed buffer types plus one USE_HOST_PTR sub-test per offset.
#define NUM_SUBTESTS (3 + NUM_OFFSETS)
// Defined in another translation unit.
extern const char *blkStr[2];
// Registers the sub-test count. The first two passes over every size and
// buffer type (one per Iterations[] entry) are counted in blockedSubtests; a
// third pass over the same combinations follows them.
OCLPerfBufferReadSpeed::OCLPerfBufferReadSpeed() {
  const unsigned int combosPerPass = NUM_SIZES * NUM_SUBTESTS;
  blockedSubtests = 2 * combosPerPass;
  _numSubTests = blockedSubtests + combosPerPass;
}
// Empty destructor; it performs no cleanup of its own.
OCLPerfBufferReadSpeed::~OCLPerfBufferReadSpeed() {}
// Context notification callback passed to clCreateContext. It deliberately
// does nothing with the reported error information.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/, void * /*user_data*/) {
}
void OCLPerfBufferReadSpeed::open(unsigned int test, char *units,
double &conversion, unsigned int deviceId) {
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_uint num_devices = 0;
cl_device_id *devices = NULL;
cl_device_id device = NULL;
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
context_ = 0;
cmd_queue_ = 0;
outBuffer_ = 0;
persistent = false;
allocHostPtr = false;
useHostPtr = false;
hostMem = NULL;
alignedMem = NULL;
alignment = 4096;
isAMD = false;
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
#if 0
// Get last for default
platform = platforms[numPlatforms-1];
for (unsigned i = 0; i < numPlatforms; ++i) {
#endif
platform = platforms[_platformIndex];
char pbuf[100];
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
NULL);
num_devices = 0;
/* Get the number of requested devices */
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
&num_devices);
// Runtime returns an error when no GPU devices are present instead of just
// returning 0 devices
// CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
// Choose platform with GPU devices
if (num_devices > 0) {
if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
isAMD = true;
}
// platform = platforms[_platformIndex];
// break;
}
#if 0
}
#endif
delete platforms;
}
/*
* If we could find our platform, use it. If not, die as we need the AMD
* platform for these extensions.
*/
CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
char getVersion[128];
error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
sizeof(getVersion), getVersion, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
platformVersion[0] = getVersion[7];
platformVersion[1] = getVersion[8];
platformVersion[2] = getVersion[9];
platformVersion[3] = '\0';
bufSize_ = Sizes[_openTest % NUM_SIZES];
if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) {
useHostPtr = true;
offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3];
} else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) {
persistent = true;
} else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) {
allocHostPtr = true;
}
if (_openTest < blockedSubtests) {
numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
} else {
numIter =
4 * OCLPerfBufferReadSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1);
}
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
CHECK_RESULT(devices == 0, "no devices");
/* Get the requested device */
error_ =
_wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
device = devices[_deviceId];
context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
&error_);
CHECK_RESULT(context_ == 0, "clCreateContext failed");
cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
cl_mem_flags flags = CL_MEM_WRITE_ONLY;
if (persistent) {
flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
} else if (allocHostPtr) {
flags |= CL_MEM_ALLOC_HOST_PTR;
} else if (useHostPtr) {
flags |= CL_MEM_USE_HOST_PTR;
hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
alignedMem =
(char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
offset);
}
outBuffer_ =
_wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
// Force memory to be on GPU if possible
{
cl_mem memBuffer =
_wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
_wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
bufSize_, 0, NULL, NULL);
_wrapper->clFinish(cmd_queue_);
_wrapper->clReleaseMemObject(memBuffer);
}
}
// Times numIter clEnqueueReadBuffer() calls from outBuffer_ into a temporary
// host buffer and stores the resulting bandwidth (GB/s) in _perfInfo.
// testDescString receives a description of the subtest configuration.
void OCLPerfBufferReadSpeed::run(void) {
  // Owns the host destination array. CHECK_RESULT presumably returns from
  // this function on failure, so the array must be released by a destructor
  // (with delete[], matching new[]) rather than by a statement at the end.
  struct ScopedHostArray {
    char *ptr;
    explicit ScopedHostArray(size_t bytes) : ptr(new char[bytes]) {}
    ~ScopedHostArray() { delete[] ptr; }
  };

  CPerfCounter timer;
  ScopedHostArray host(bufSize_);
  char *mem = host.ptr;
  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;

  // Warm up
  error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
                                         bufSize_, mem, 0, NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueReadBuffer failed");

  timer.Reset();
  timer.Start();
  for (unsigned int i = 0; i < numIter; i++) {
    error_ = _wrapper->clEnqueueReadBuffer(cmd_queue_, outBuffer_, blocking, 0,
                                           bufSize_, mem, 0, NULL, NULL);
    CHECK_RESULT(error_, "clEnqueueReadBuffer failed");
  }
  // Non-blocking reads are only enqueued above; wait for them inside the
  // timed region.
  if (blocking != CL_TRUE) {
    _wrapper->clFinish(cmd_queue_);
  }
  timer.Stop();
  double sec = timer.GetElapsedTime();

  // Buffer read bandwidth in GB/s
  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
  _perfInfo = (float)perf;

  char str[256];
  if (persistent) {
    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
  } else if (allocHostPtr) {
    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
  } else if (useHostPtr) {
    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
  } else {
    SNPRINTF(str, sizeof(str), "(GB/s)");
  }

  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
           blkStr[blocking], numIter, str);
  testDescString = buf;
}
unsigned int OCLPerfBufferReadSpeed::close(void) {
if (outBuffer_) {
error_ = _wrapper->clReleaseMemObject(outBuffer_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseMemObject(outBuffer_) failed");
}
if (cmd_queue_) {
error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
if (context_) {
error_ = _wrapper->clReleaseContext(context_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
}
if (hostMem) {
free(hostMem);
}
return _crcword;
}
// Rectangular variant of the read test: times clEnqueueReadBufferRect() on a
// width x width byte region, where width = floor(sqrt(bufSize_)), and stores
// the bandwidth (GB/s) in _perfInfo. The test is skipped when the platform
// version recorded by open() reads "1?0".
void OCLPerfBufferReadRectSpeed::run(void) {
  // Skip for 1.0 platforms. Done before anything is allocated so the early
  // return cannot leak the host buffer.
  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
    char buf[256];
    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
    testDescString = buf;
    return;
  }

  // Owns the host destination array. CHECK_RESULT presumably returns from
  // this function on failure, so the array is released by a destructor
  // (with delete[], matching new[]).
  struct ScopedHostArray {
    char *ptr;
    explicit ScopedHostArray(size_t bytes) : ptr(new char[bytes]) {}
    ~ScopedHostArray() { delete[] ptr; }
  };

  CPerfCounter timer;
  ScopedHostArray host(bufSize_);
  char *mem = host.ptr;

  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
  size_t bufOrigin[3] = {0, 0, 0};
  size_t hostOrigin[3] = {0, 0, 0};
  size_t region[3] = {width, width, 1};
  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;

  // Clamp iterations to reduce run time
  unsigned int testNumIter = (numIter < 100 ? numIter : 100);

  // Warm up
  error_ = _wrapper->clEnqueueReadBufferRect(
      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
      width, 0, mem, 0, NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");

  timer.Reset();
  timer.Start();
  for (unsigned int i = 0; i < testNumIter; i++) {
    error_ = _wrapper->clEnqueueReadBufferRect(
        cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width,
        0, width, 0, mem, 0, NULL, NULL);
    CHECK_RESULT(error_, "clEnqueueReadBufferRect failed");
  }
  // Non-blocking reads are only enqueued above; wait for them inside the
  // timed region.
  if (blocking != CL_TRUE) {
    _wrapper->clFinish(cmd_queue_);
  }
  timer.Stop();
  double sec = timer.GetElapsedTime();

  // Buffer read bandwidth in GB/s
  double perf = ((double)bufSize_ * testNumIter * (double)(1e-09)) / sec;
  _perfInfo = (float)perf;

  char str[256];
  if (persistent) {
    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
  } else if (allocHostPtr) {
    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
  } else if (useHostPtr) {
    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
  } else {
    SNPRINTF(str, sizeof(str), "(GB/s)");
  }

  // NOTE(review): the description reports numIter although the timed loop ran
  // testNumIter (clamped to 100) iterations. Left unchanged in case result
  // tracking keys on this string -- confirm before altering.
  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
           blkStr[blocking], numIter, str);
  testDescString = buf;
}
@@ -0,0 +1,65 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_BufferReadSpeed_H_
#define _OCL_BufferReadSpeed_H_
#include "OCLTestImp.h"
// Performance test that times clEnqueueReadBuffer() transfers from a device
// buffer into host memory for several buffer sizes and allocation modes.
class OCLPerfBufferReadSpeed : public OCLTestImp {
public:
OCLPerfBufferReadSpeed();
virtual ~OCLPerfBufferReadSpeed();
public:
// Selects the platform/device and creates context_, cmd_queue_ and
// outBuffer_ for subtest `test`.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Runs the timed transfers and stores the bandwidth in GB/s.
virtual void run(void);
// Releases what open() created.
virtual unsigned int close(void);
// Base iteration count; open() derives numIter for the non-blocking subtests
// as 4 * NUM_ITER / (size index + 1).
static const unsigned int NUM_ITER = 1000;
// OpenCL objects created in open() and released in close().
cl_context context_;
cl_command_queue cmd_queue_;
// Device buffer that is read back; created with CL_MEM_WRITE_ONLY plus the
// flag selected by the three booleans below.
cl_mem outBuffer_;
// Status of the most recent OpenCL call.
cl_int error_;
// Size of outBuffer_ in bytes.
unsigned int bufSize_;
// Allocation mode of the current subtest (at most one is true):
// CL_MEM_USE_PERSISTENT_MEM_AMD, CL_MEM_ALLOC_HOST_PTR, CL_MEM_USE_HOST_PTR.
bool persistent;
bool allocHostPtr;
bool useHostPtr;
// Number of timed transfers for the current subtest.
unsigned int numIter;
// USE_HOST_PTR backing store: hostMem is the raw malloc() block (freed in
// close()); alignedMem is hostMem rounded up to `alignment` plus `offset`
// and is what gets passed to clCreateBuffer().
char* hostMem;
char* alignedMem;
// Alignment in bytes applied to hostMem; open() sets it to 4096.
size_t alignment;
// Byte offset added after alignment; only assigned for USE_HOST_PTR subtests.
unsigned int offset;
// True when the selected platform reports devices of the requested type and
// its vendor string is "Advanced Micro Devices, Inc.".
bool isAMD;
// Characters 7..9 of the CL_PLATFORM_VERSION string, NUL-terminated
// (presumably "<major>.<minor>").
char platformVersion[32];
};
// Rectangular-transfer variant of OCLPerfBufferReadSpeed: it reuses the base
// class open()/close() and replaces only run().
class OCLPerfBufferReadRectSpeed : public OCLPerfBufferReadSpeed {
 public:
  // The base-class default constructor runs implicitly.
  OCLPerfBufferReadRectSpeed() {}

  virtual void run();
};
#endif // _OCL_BufferReadSpeed_H_
@@ -0,0 +1,333 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfBufferWriteSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <complex>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings: use the bounded formatter that each toolchain
// provides under one name.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Number of buffer sizes exercised per allocation mode.
#define NUM_SIZES 8
// 1 KB, 32 KB, 64 KB, 128 KB, 256 KB, 1 MB, 4 MB, 16 MB
static const unsigned int Sizes[NUM_SIZES] = {
1024, 32 * 1024, 64 * 1024, 128 * 1024, 262144, 1048576, 4194304, 16777216};
// Subtest indices below this value use blocking transfers; it is assigned in
// the OCLPerfBufferWriteSpeed constructor.
static cl_uint blockedSubtests;
// Iteration counts for the blocking subtests, indexed by
// subtest / (NUM_SIZES * NUM_SUBTESTS).
static const unsigned int Iterations[2] = {1,
OCLPerfBufferWriteSpeed::NUM_ITER};
// Host-pointer offsets (in bytes) tried for the USE_HOST_PTR mode.
#define NUM_OFFSETS 1
static const unsigned int offsets[NUM_OFFSETS] = {0};
// Allocation modes per size: plain, ALLOC_HOST_PTR, persistent, and one
// USE_HOST_PTR mode per offset.
#define NUM_SUBTESTS (3 + NUM_OFFSETS)
// Labels printed in the test description, indexed by the blocking flag
// (CL_FALSE / CL_TRUE); defined in another translation unit.
extern const char *blkStr[2];
// Registers the subtest count: two blocking passes over every size/mode
// combination followed by one non-blocking pass. The boundary between the
// two groups is kept in the file-scope blockedSubtests.
OCLPerfBufferWriteSpeed::OCLPerfBufferWriteSpeed() {
  const cl_uint combosPerPass = NUM_SIZES * NUM_SUBTESTS;
  blockedSubtests = 2 * combosPerPass;
  _numSubTests = blockedSubtests + combosPerPass;
}
// Intentionally empty: the OpenCL objects and the host allocation created by
// open() are released in close(), not here.
OCLPerfBufferWriteSpeed::~OCLPerfBufferWriteSpeed() {}
// Context error callback handed to clCreateContext() in open(). It is a
// deliberate no-op, so the parameters are left unnamed.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/,
                                        void * /*user_data*/) {}
void OCLPerfBufferWriteSpeed::open(unsigned int test, char *units,
double &conversion, unsigned int deviceId) {
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_uint num_devices = 0;
cl_device_id *devices = NULL;
cl_device_id device = NULL;
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
context_ = 0;
cmd_queue_ = 0;
outBuffer_ = 0;
persistent = false;
allocHostPtr = false;
useHostPtr = false;
hostMem = NULL;
alignedMem = NULL;
alignment = 4096;
isAMD = false;
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
#if 0
// Get last for default
platform = platforms[numPlatforms-1];
for (unsigned i = 0; i < numPlatforms; ++i) {
#endif
platform = platforms[_platformIndex];
char pbuf[100];
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
NULL);
num_devices = 0;
/* Get the number of requested devices */
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
&num_devices);
// Runtime returns an error when no GPU devices are present instead of just
// returning 0 devices
// CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
// Choose platform with GPU devices
if (num_devices > 0) {
if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
isAMD = true;
}
// platform = platforms[_platformIndex];
// break;
}
#if 0
}
#endif
delete platforms;
}
/*
* If we could find our platform, use it. If not, die as we need the AMD
* platform for these extensions.
*/
CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
char getVersion[128];
error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
sizeof(getVersion), getVersion, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformInfo failed");
platformVersion[0] = getVersion[7];
platformVersion[1] = getVersion[8];
platformVersion[2] = getVersion[9];
platformVersion[3] = '\0';
bufSize_ = Sizes[_openTest % NUM_SIZES];
if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) > 2) {
useHostPtr = true;
offset = offsets[((_openTest / NUM_SIZES) % NUM_SUBTESTS) - 3];
} else if ((((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 2) && isAMD) {
persistent = true;
} else if (((_openTest / NUM_SIZES) % NUM_SUBTESTS) == 1) {
allocHostPtr = true;
}
if (_openTest < blockedSubtests) {
numIter = Iterations[_openTest / (NUM_SIZES * NUM_SUBTESTS)];
} else {
numIter =
4 * OCLPerfBufferWriteSpeed::NUM_ITER / ((_openTest % NUM_SIZES) + 1);
}
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
CHECK_RESULT(devices == 0, "no devices");
/* Get the requested device */
error_ =
_wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
device = devices[_deviceId];
context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
&error_);
CHECK_RESULT(context_ == 0, "clCreateContext failed");
cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");
cl_mem_flags flags = CL_MEM_READ_ONLY;
if (persistent) {
flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
} else if (allocHostPtr) {
flags |= CL_MEM_ALLOC_HOST_PTR;
} else if (useHostPtr) {
flags |= CL_MEM_USE_HOST_PTR;
hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
alignedMem =
(char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
offset);
}
outBuffer_ =
_wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
// Force memory to be on GPU if possible
{
cl_mem memBuffer =
_wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
_wrapper->clEnqueueCopyBuffer(cmd_queue_, outBuffer_, memBuffer, 0, 0,
bufSize_, 0, NULL, NULL);
_wrapper->clFinish(cmd_queue_);
_wrapper->clReleaseMemObject(memBuffer);
}
}
// Times numIter clEnqueueWriteBuffer() calls from a temporary host buffer
// into outBuffer_ and stores the resulting bandwidth (GB/s) in _perfInfo.
// testDescString receives a description of the subtest configuration.
void OCLPerfBufferWriteSpeed::run(void) {
  // Owns the host source array. CHECK_RESULT presumably returns from this
  // function on failure, so the array must be released by a destructor
  // (with delete[], matching new[]) rather than by a statement at the end.
  struct ScopedHostArray {
    char *ptr;
    explicit ScopedHostArray(size_t bytes) : ptr(new char[bytes]) {}
    ~ScopedHostArray() { delete[] ptr; }
  };

  CPerfCounter timer;
  ScopedHostArray host(bufSize_);
  char *mem = host.ptr;
  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;

  // Warm up
  error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, CL_TRUE, 0,
                                          bufSize_, mem, 0, NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");

  timer.Reset();
  timer.Start();
  for (unsigned int i = 0; i < numIter; i++) {
    error_ = _wrapper->clEnqueueWriteBuffer(cmd_queue_, outBuffer_, blocking, 0,
                                            bufSize_, mem, 0, NULL, NULL);
    CHECK_RESULT(error_, "clEnqueueWriteBuffer failed");
  }
  // Non-blocking writes are only enqueued above; wait for them inside the
  // timed region.
  if (blocking != CL_TRUE) {
    _wrapper->clFinish(cmd_queue_);
  }
  timer.Stop();
  double sec = timer.GetElapsedTime();

  // Buffer write bandwidth in GB/s
  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
  _perfInfo = (float)perf;

  char str[256];
  if (persistent) {
    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
  } else if (allocHostPtr) {
    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
  } else if (useHostPtr) {
    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
  } else {
    SNPRINTF(str, sizeof(str), "(GB/s)");
  }

  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
           blkStr[blocking], numIter, str);
  testDescString = buf;
}
unsigned int OCLPerfBufferWriteSpeed::close(void) {
if (outBuffer_) {
error_ = _wrapper->clReleaseMemObject(outBuffer_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseMemObject(outBuffer_) failed");
}
if (cmd_queue_) {
error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
if (context_) {
error_ = _wrapper->clReleaseContext(context_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
}
if (hostMem) {
free(hostMem);
}
return _crcword;
}
// Rectangular variant of the write test: times clEnqueueWriteBufferRect() on
// a width x width byte region, where width = floor(sqrt(bufSize_)), and
// stores the bandwidth (GB/s) in _perfInfo. The test is skipped when the
// platform version recorded by open() reads "1?0".
void OCLPerfBufferWriteRectSpeed::run(void) {
  // Skip for 1.0 platforms. Done before anything is allocated so the early
  // return cannot leak the host buffer.
  if ((platformVersion[0] == '1') && (platformVersion[2] == '0')) {
    char buf[256];
    SNPRINTF(buf, sizeof(buf), " SKIPPED ");
    testDescString = buf;
    return;
  }

  // Owns the host source array. CHECK_RESULT presumably returns from this
  // function on failure, so the array is released by a destructor
  // (with delete[], matching new[]).
  struct ScopedHostArray {
    char *ptr;
    explicit ScopedHostArray(size_t bytes) : ptr(new char[bytes]) {}
    ~ScopedHostArray() { delete[] ptr; }
  };

  CPerfCounter timer;
  ScopedHostArray host(bufSize_);
  char *mem = host.ptr;

  size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
  size_t bufOrigin[3] = {0, 0, 0};
  size_t hostOrigin[3] = {0, 0, 0};
  size_t region[3] = {width, width, 1};
  cl_bool blocking = (_openTest < blockedSubtests) ? CL_TRUE : CL_FALSE;

  // Warm up
  error_ = _wrapper->clEnqueueWriteBufferRect(
      cmd_queue_, outBuffer_, CL_TRUE, bufOrigin, hostOrigin, region, width, 0,
      width, 0, mem, 0, NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");

  timer.Reset();
  timer.Start();
  for (unsigned int i = 0; i < numIter; i++) {
    error_ = _wrapper->clEnqueueWriteBufferRect(
        cmd_queue_, outBuffer_, blocking, bufOrigin, hostOrigin, region, width,
        0, width, 0, mem, 0, NULL, NULL);
    CHECK_RESULT(error_, "clEnqueueWriteBufferRect failed");
  }
  // Non-blocking writes are only enqueued above; wait for them inside the
  // timed region.
  if (blocking != CL_TRUE) {
    _wrapper->clFinish(cmd_queue_);
  }
  timer.Stop();
  double sec = timer.GetElapsedTime();

  // Buffer write bandwidth in GB/s
  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
  _perfInfo = (float)perf;

  char str[256];
  if (persistent) {
    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
  } else if (allocHostPtr) {
    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
  } else if (useHostPtr) {
    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
  } else {
    SNPRINTF(str, sizeof(str), "(GB/s)");
  }

  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %3s i: %4d %29s ", bufSize_,
           blkStr[blocking], numIter, str);
  testDescString = buf;
}
@@ -0,0 +1,65 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_BufferWriteSpeed_H_
#define _OCL_BufferWriteSpeed_H_
#include "OCLTestImp.h"
// Performance test that times clEnqueueWriteBuffer() transfers from host
// memory into a device buffer for several buffer sizes and allocation modes.
class OCLPerfBufferWriteSpeed : public OCLTestImp {
public:
// Registers the number of subtests.
OCLPerfBufferWriteSpeed();
virtual ~OCLPerfBufferWriteSpeed();
public:
// Selects the platform/device and creates context_, cmd_queue_ and
// outBuffer_ for subtest `test`.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Runs the timed transfers and stores the bandwidth in GB/s.
virtual void run(void);
// Releases what open() created.
virtual unsigned int close(void);
// Iteration count of the second blocking pass; open() also derives numIter
// for the non-blocking subtests as 4 * NUM_ITER / (size index + 1).
static const unsigned int NUM_ITER = 1000;
// OpenCL objects created in open() and released in close().
cl_context context_;
cl_command_queue cmd_queue_;
// Device buffer that is written to; created with CL_MEM_READ_ONLY plus the
// flag selected by the three booleans below.
cl_mem outBuffer_;
// Status of the most recent OpenCL call.
cl_int error_;
// Size of outBuffer_ in bytes.
unsigned int bufSize_;
// Allocation mode of the current subtest (at most one is true):
// CL_MEM_USE_PERSISTENT_MEM_AMD, CL_MEM_ALLOC_HOST_PTR, CL_MEM_USE_HOST_PTR.
bool persistent;
bool allocHostPtr;
bool useHostPtr;
// Number of timed transfers for the current subtest.
unsigned int numIter;
// USE_HOST_PTR backing store: hostMem is the raw malloc() block (freed in
// close()); alignedMem is hostMem rounded up to `alignment` plus `offset`
// and is what gets passed to clCreateBuffer().
char* hostMem;
char* alignedMem;
// Alignment in bytes applied to hostMem; open() sets it to 4096.
size_t alignment;
// Byte offset added after alignment; only assigned for USE_HOST_PTR subtests.
unsigned int offset;
// True when the selected platform reports devices of the requested type and
// its vendor string is "Advanced Micro Devices, Inc.".
bool isAMD;
// Characters 7..9 of the CL_PLATFORM_VERSION string, NUL-terminated
// (presumably "<major>.<minor>").
char platformVersion[32];
};
// Rectangular-transfer variant of OCLPerfBufferWriteSpeed: it reuses the base
// class open()/close() and replaces only run().
class OCLPerfBufferWriteRectSpeed : public OCLPerfBufferWriteSpeed {
 public:
  // The base-class default constructor runs implicitly.
  OCLPerfBufferWriteRectSpeed() {}

  virtual void run();
};
#endif // _OCL_BufferWriteSpeed_H_
@@ -0,0 +1,304 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfCPUMemSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings: use the bounded formatter that each toolchain
// provides under one name.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Number of buffer sizes exercised per allocation mode.
#define NUM_SIZES 4
// 256KB, 1 MB, 4MB, 16 MB
static const unsigned int Sizes[NUM_SIZES] = {262144, 1048576, 4194304,
16777216};
// Number of entries in Iterations[]; each size/mode combination is run once
// per entry.
#define ITER_COUNT 2
static const unsigned int Iterations[2] = {1, OCLPerfCPUMemSpeed::NUM_ITER};
// Host-pointer offsets (in bytes) tried for the USE_HOST_PTR mode.
#define NUM_OFFSETS 1
static const unsigned int offsets[NUM_OFFSETS] = {0};
// Allocation modes per size: plain, ALLOC_HOST_PTR, persistent, and one
// USE_HOST_PTR mode per offset.
#define NUM_SUBTESTS (3 + NUM_OFFSETS)
// Registers the subtest count: every size / allocation mode / iteration
// count combination is run for three operations (memcpy to the mapping,
// memcpy from the mapping, memset of the mapping -- see open()).
OCLPerfCPUMemSpeed::OCLPerfCPUMemSpeed() {
  const unsigned int combosPerOperation =
      NUM_SIZES * NUM_SUBTESTS * ITER_COUNT;
  const unsigned int operationCount = 3;
  _numSubTests = combosPerOperation * operationCount;
}
// Intentionally empty: the OpenCL objects and the host allocation created by
// open() are released in close(), not here.
OCLPerfCPUMemSpeed::~OCLPerfCPUMemSpeed() {}
// Context error callback handed to clCreateContext() in open(). It is a
// deliberate no-op, so the parameters are left unnamed.
static void CL_CALLBACK notify_callback(const char * /*errinfo*/,
                                        const void * /*private_info*/,
                                        size_t /*cb*/,
                                        void * /*user_data*/) {}
// Prepares one CPU-access subtest on a mapped buffer.
//
// Selects the platform at _platformIndex and the device at index deviceId,
// decodes the subtest number into a buffer size / allocation mode / iteration
// count / operation, then creates the context, the command queue and
// outBuffer_.
//
//   test        subtest index; decoded as described inline below.
//   units       not used by this implementation.
//   conversion  set to 1.0.
//   deviceId    index into the device list reported for type_.
//
// Failures are reported through CHECK_RESULT, which presumably records the
// error and returns from this function (a _NO_RETURN variant exists) -- so
// temporary arrays are released *before* the corresponding check is made.
void OCLPerfCPUMemSpeed::open(unsigned int test, char *units,
                              double &conversion, unsigned int deviceId) {
  cl_uint numPlatforms = 0;
  cl_platform_id platform = NULL;
  cl_uint num_devices = 0;
  cl_device_id *devices = NULL;
  cl_device_id device = NULL;

  _crcword = 0;
  conversion = 1.0f;
  _deviceId = deviceId;
  _openTest = test;

  context_ = 0;
  cmd_queue_ = 0;
  outBuffer_ = 0;
  persistent = false;
  allocHostPtr = false;
  useHostPtr = false;
  hostMem = NULL;
  alignedMem = NULL;
  alignment = 4096;
  offset = 0;  // only meaningful for USE_HOST_PTR subtests; never leave it indeterminate
  testMemset = false;
  isAMD = false;
  gpuSrc = false;

  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");

  if (0 < numPlatforms) {
    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
    error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
    // Copy the selected handle out (bounds-checked) and free the array right
    // away so that no early exit below can leak it.
    if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
      platform = platforms[_platformIndex];
    }
    delete[] platforms;
    CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
  }

  /*
   * If we could find our platform, use it. If not, die as we need the AMD
   * platform for these extensions.
   */
  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");

  // Vendor string; zero-initialised so strcmp() below is well defined even if
  // the (unchecked) query fails.
  char pbuf[100] = {0};
  error_ = _wrapper->clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
                                       sizeof(pbuf), pbuf, NULL);
  if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) {
    isAMD = true;
  }

  /* Get the number of requested devices */
  num_devices = 0;
  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
  // Runtime returns an error when no GPU devices are present instead of just
  // returning 0 devices, so only the count is checked here.
  CHECK_RESULT(num_devices == 0, "No devices found, cannot proceed");

  // Subtest decoding: the low "digit" picks the size, the next one picks the
  // allocation mode (0 plain, 1 ALLOC_HOST_PTR, 2 persistent on AMD only,
  // 3.. USE_HOST_PTR with offsets[mode - 3]).
  bufSize_ = Sizes[_openTest % NUM_SIZES];
  const unsigned int allocMode = (_openTest / NUM_SIZES) % NUM_SUBTESTS;
  if (allocMode > 2) {
    useHostPtr = true;
    offset = offsets[allocMode - 3];
  } else if ((allocMode == 2) && isAMD) {
    persistent = true;
  } else if (allocMode == 1) {
    allocHostPtr = true;
  }

  numIter = Iterations[(_openTest / (NUM_SIZES * NUM_SUBTESTS)) % 2];

  // First third of the subtests: memcpy into the mapping. Second third:
  // memcpy out of the mapping (capped at 10 iterations). Last third: memset.
  if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT * 2)) {
    testMemset = true;
  } else if (_openTest >= (NUM_SIZES * NUM_SUBTESTS * ITER_COUNT)) {
    gpuSrc = true;
    numIter = std::min(numIter, 10u);
  }

  devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
  CHECK_RESULT(devices == 0, "no devices");
  /* Get the requested device */
  error_ =
      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
  // Same pattern as for the platform list: take what we need, free, then check.
  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
    device = devices[_deviceId];
  }
  free(devices);
  devices = NULL;
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");

  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
                                       &error_);
  CHECK_RESULT(context_ == 0, "clCreateContext failed");

  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");

  // The buffer's device-side access and the host map direction are opposite:
  // when the host reads from the mapping the device is the writer, and vice
  // versa.
  cl_mem_flags flags;
  if (gpuSrc) {
    flags = CL_MEM_WRITE_ONLY;
    mapFlags = CL_MAP_READ;
  } else {
    flags = CL_MEM_READ_ONLY;
    mapFlags = CL_MAP_WRITE;
  }
  if (persistent) {
    flags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
  } else if (allocHostPtr) {
    flags |= CL_MEM_ALLOC_HOST_PTR;
  } else if (useHostPtr) {
    flags |= CL_MEM_USE_HOST_PTR;
    // Over-allocate so the pointer can be rounded up to `alignment` and then
    // shifted by `offset`. hostMem is released in close().
    hostMem = (char *)malloc(bufSize_ + alignment - 1 + offset);
    CHECK_RESULT(hostMem == 0, "malloc(hostMem) failed");
    alignedMem =
        (char *)((((intptr_t)hostMem + alignment - 1) & ~(alignment - 1)) +
                 offset);
  }

  outBuffer_ =
      _wrapper->clCreateBuffer(context_, flags, bufSize_, alignedMem, &error_);
  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");

  // Force memory to be on GPU if possible
  {
    cl_mem memBuffer =
        _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
    CHECK_RESULT(memBuffer == 0, "clCreateBuffer(memBuffer) failed");
    _wrapper->clEnqueueCopyBuffer(cmd_queue_, memBuffer, outBuffer_, 0, 0,
                                  bufSize_, 0, NULL, NULL);
    _wrapper->clFinish(cmd_queue_);
    _wrapper->clReleaseMemObject(memBuffer);
  }
}
// Maps outBuffer_ and times numIter CPU memset()/memcpy() passes over the
// mapped pointer, storing the resulting bandwidth (GB/s) in _perfInfo.
// testDescString receives a description of the subtest configuration.
void OCLPerfCPUMemSpeed::run(void) {
  CPerfCounter timer;
  void *mem;

  // Warm up: one complete map / unmap round trip before the measured map.
  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
                                     0, bufSize_, 0, NULL, NULL, &error_);
  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
                                             NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
  error_ = _wrapper->clFinish(cmd_queue_);
  CHECK_RESULT(error_, "clFinish failed");

  mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer_, CL_TRUE, mapFlags,
                                     0, bufSize_, 0, NULL, NULL, &error_);
  // Validate the mapping before the loops below dereference it, and before
  // the scratch buffer is allocated so this exit cannot leak it.
  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");

  char *cpumem = new char[bufSize_];
  timer.Reset();
  timer.Start();
  if (testMemset) {
    for (unsigned int i = 0; i < numIter; i++) {
      memset(mem, 0, bufSize_);
    }
  } else {
    if (gpuSrc) {
      for (unsigned int i = 0; i < numIter; i++) {
        memcpy((void *)cpumem, mem, bufSize_);
      }
    } else {
      for (unsigned int i = 0; i < numIter; i++) {
        memcpy(mem, (void *)cpumem, bufSize_);
      }
    }
  }
  timer.Stop();
  delete[] cpumem;

  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer_, mem, 0,
                                             NULL, NULL);
  CHECK_RESULT(error_, "clEnqueueUnmapMemObject failed");
  error_ = _wrapper->clFinish(cmd_queue_);
  CHECK_RESULT(error_, "clFinish failed");
  double sec = timer.GetElapsedTime();

  // CPU memset/memcpy bandwidth on the mapped buffer in GB/s
  double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec;
  _perfInfo = (float)perf;

  char str[256];
  if (persistent) {
    SNPRINTF(str, sizeof(str), "PERSISTENT (GB/s)");
  } else if (allocHostPtr) {
    SNPRINTF(str, sizeof(str), "ALLOC_HOST_PTR (GB/s)");
  } else if (useHostPtr) {
    SNPRINTF(str, sizeof(str), "off: %4d USE_HOST_PTR (GB/s)", offset);
  } else {
    SNPRINTF(str, sizeof(str), "(GB/s)");
  }

  const char *str2 = NULL;
  if (testMemset) {
    str2 = "memset to dev";
  } else if (gpuSrc) {
    str2 = "memcpy from dev";
  } else {
    str2 = "memcpy to dev";
  }

  char buf[256];
  SNPRINTF(buf, sizeof(buf), " (%8d bytes) %15s i: %4d %29s ", bufSize_, str2,
           numIter, str);
  testDescString = buf;
}
unsigned int OCLPerfCPUMemSpeed::close(void) {
if (outBuffer_) {
error_ = _wrapper->clReleaseMemObject(outBuffer_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseMemObject(outBuffer_) failed");
}
if (cmd_queue_) {
error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
if (context_) {
error_ = _wrapper->clReleaseContext(context_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
}
if (hostMem) {
free(hostMem);
}
return _crcword;
}
@@ -0,0 +1,59 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_CPUMemSpeed_H_
#define _OCL_CPUMemSpeed_H_
#include "OCLTestImp.h"
// Performance test that maps a buffer with clEnqueueMapBuffer() and times CPU
// memset()/memcpy() passes over the mapped pointer.
class OCLPerfCPUMemSpeed : public OCLTestImp {
public:
// Registers the number of subtests.
OCLPerfCPUMemSpeed();
virtual ~OCLPerfCPUMemSpeed();
public:
// Selects the platform/device and creates context_, cmd_queue_ and
// outBuffer_ for subtest `test`.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Runs the timed CPU accesses and stores the bandwidth in GB/s.
virtual void run(void);
// Releases what open() created.
virtual unsigned int close(void);
// Iteration count used by the second entry of the Iterations table.
static const unsigned int NUM_ITER = 100;
// OpenCL objects created in open() and released in close().
cl_context context_;
cl_command_queue cmd_queue_;
// Buffer that gets mapped; READ_ONLY or WRITE_ONLY depending on gpuSrc, plus
// the flag selected by the three booleans below.
cl_mem outBuffer_;
// Status of the most recent OpenCL call.
cl_int error_;
// Size of outBuffer_ in bytes.
unsigned int bufSize_;
// Allocation mode of the current subtest (at most one is true):
// CL_MEM_USE_PERSISTENT_MEM_AMD, CL_MEM_ALLOC_HOST_PTR, CL_MEM_USE_HOST_PTR.
bool persistent;
bool allocHostPtr;
bool useHostPtr;
// Number of timed memset/memcpy passes for the current subtest.
unsigned int numIter;
// True when the subtest times memset() on the mapping instead of memcpy().
bool testMemset;
// USE_HOST_PTR backing store: hostMem is the raw malloc() block (freed in
// close()); alignedMem is hostMem rounded up to `alignment` plus `offset`
// and is what gets passed to clCreateBuffer().
char* hostMem;
char* alignedMem;
// Alignment in bytes applied to hostMem; open() sets it to 4096.
size_t alignment;
// Byte offset added after alignment; only assigned for USE_HOST_PTR subtests.
unsigned int offset;
// True when the selected platform's vendor string is
// "Advanced Micro Devices, Inc.".
bool isAMD;
// True when the subtest copies out of the mapping (mapping is the memcpy
// source); the buffer is then mapped with CL_MAP_READ.
bool gpuSrc;
// Flags passed to clEnqueueMapBuffer(): CL_MAP_READ or CL_MAP_WRITE.
cl_map_flags mapFlags;
};
#endif // _OCL_CPUMemSpeed_H_
@@ -0,0 +1,146 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfCommandQueue.h"
#include <Timer.h>
#include <assert.h>
#include <stdio.h>
#include <sstream>
#include <string>
#include "CL/cl.h"
#include "CL/cl_ext.h"
// Tuning constants for the OCLPerfCommandQueue sub-tests.
// BufSize counts cl_int4 elements, not bytes.
static const size_t BufSize = 4096;     // 0x1000
static const size_t Iterations = 256;   // 0x100
static const size_t TotalQueues = 4;
static const size_t TotalBufs = 4;
// Registers one sub-test per (TotalQueues x TotalBufs) combination and starts
// with the "failed" flag cleared.
OCLPerfCommandQueue::OCLPerfCommandQueue() : failed_(false) {
  _numSubTests = TotalBufs * TotalQueues;
}

// The destructor body is intentionally empty.
OCLPerfCommandQueue::~OCLPerfCommandQueue() {}
void OCLPerfCommandQueue::open(unsigned int test, char* units,
double& conversion, unsigned int deviceId) {
cl_mem buffer;
_deviceId = deviceId;
CPerfCounter timer;
timer.Reset();
timer.Start();
OCLTestImp::open(test, units, conversion, deviceId);
CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
timer.Stop();
if (test == 0) {
printf("Runtime load/init time: %0.2f ms\n",
static_cast<float>(timer.GetElapsedTime() * 1000));
}
test_ = test;
cl_device_type deviceType;
error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
sizeof(deviceType), &deviceType, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");
if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
printf("GPU device is required for this test!\n");
failed_ = true;
return;
}
static const size_t MemObjects[] = {1, 100, 1000, 5000};
size_t numMems = MemObjects[test_ / TotalBufs];
size_t bufSize = BufSize * sizeof(cl_int4);
for (size_t b = 0; b < numMems; ++b) {
buffer = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, bufSize,
NULL, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
buffers_.push_back(buffer);
}
}
// No-op callback: every argument is ignored.
static void CL_CALLBACK notify_callback(const char* errinfo,
                                        const void* private_info, size_t cb,
                                        void* user_data) {
  (void)errinfo;
  (void)private_info;
  (void)cb;
  (void)user_data;
}
void OCLPerfCommandQueue::run(void) {
if (failed_) {
return;
}
unsigned int* values;
values = reinterpret_cast<unsigned int*>(new cl_int4[BufSize]);
CPerfCounter timer;
static const size_t Queues[] = {1, 2, 4, 8};
size_t numQueues = Queues[test_ % TotalQueues];
// Clear destination buffer
memset(values, 0, BufSize * sizeof(cl_int4));
size_t iter =
Iterations / (numQueues * ((size_t)1 << (test_ / TotalBufs + 1)));
std::vector<cl_command_queue> cmdQueues(numQueues);
timer.Reset();
timer.Start();
for (size_t i = 0; i < iter; ++i) {
for (size_t q = 0; q < numQueues; ++q) {
cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
context_, devices_[_deviceId], 0, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
cmdQueues[q] = cmdQueue;
}
timer.Stop();
for (size_t q = 0; q < numQueues; ++q) {
for (size_t b = 0; b < buffers_.size(); ++b) {
error_ = _wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[b],
CL_TRUE, 0, sizeof(cl_int4),
values, 0, NULL, NULL);
}
}
timer.Start();
for (size_t q = 0; q < numQueues; ++q) {
error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
"clReleaseCommandQueue() failed");
}
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueWriteBuffer() failed");
}
timer.Stop();
std::stringstream stream;
stream << "Create+destroy time for " << numQueues << " queues and "
<< buffers_.size() << " buffers";
stream.precision(3);
stream.width(5);
stream.setf(std::ios::fixed, std::ios::floatfield);
stream << "(ms)";
testDescString = stream.str();
_perfInfo =
static_cast<float>(timer.GetElapsedTime() * 1000 / (iter * numQueues));
delete[] values;
}
// Delegates teardown to the base class and forwards its result unchanged.
unsigned int OCLPerfCommandQueue::close(void) {
  const unsigned int baseResult = OCLTestImp::close();
  return baseResult;
}
@@ -0,0 +1,42 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_PERF_COMMAND_QUEUE_H_
#define _OCL_PERF_COMMAND_QUEUE_H_
#include "OCLTestImp.h"
// Declaration of the OCLPerfCommandQueue test case, derived from OCLTestImp.
class OCLPerfCommandQueue : public OCLTestImp {
public:
OCLPerfCommandQueue();
virtual ~OCLPerfCommandQueue();
public:
// Test life-cycle entry points (open / run / close).
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
virtual void run(void);
virtual unsigned int close(void);
private:
// When true, run() returns immediately without measuring anything.
bool failed_;
// Index of the sub-test passed to open().
unsigned int test_;
};
#endif // _OCL_PERF_COMMAND_QUEUE_H_
@@ -0,0 +1,563 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfConcurrency.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
#include "Timer.h"
// Bounded-printf alias: sprintf_s when WIN_OS is defined, snprintf otherwise
// (keeps the compiler quiet about the plain variant).
#if !defined(WIN_OS)
#define SNPRINTF snprintf
#else
#define SNPRINTF sprintf_s
#endif
// One Mandelbrot view: centre point (x, y) and the extent of the region.
struct coordRec {
  double x;
  double y;
  double width;
};

// Table of views used by the sub-tests.
static coordRec coords[] = {
    {0.0, 0.0, 1e-5},  // All black
};

// Number of entries in coords.
static unsigned int numCoords = sizeof(coords) / sizeof(coords[0]);
// OpenCL C source of the "mandelbrot" kernel.
//
// Arguments: out (iteration counts), width, xPos/yPos (origin), xStep/yStep
// (per-pixel step) and maxIter. Each work-item handles four horizontally
// adjacent pixels (columns 4*i .. 4*i+3 of row j) using float4 arithmetic.
// The main loop advances 16 Mandelbrot iterations per pass while any of the
// four lanes is still inside the radius-2 circle; the "remainder" loop then
// steps one iteration at a time with per-lane masks. The four per-pixel
// counts are written to out as a single uint4 at index get_global_id(0).
// The string is passed verbatim to the OpenCL compiler: do not reformat it.
static const char *float_mandel_vec =
"__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
"float yPos, float xStep, float yStep, uint maxIter)\n"
"{\n"
" int tid = get_global_id(0);\n"
" int i = tid % (width/4);\n"
" int j = tid / (width/4);\n"
" int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
" int4 vecj = (int4)(j, j, j, j);\n"
" float4 x0;\n"
" x0.s0 = (float)(xPos + xStep*veci.s0);\n"
" x0.s1 = (float)(xPos + xStep*veci.s1);\n"
" x0.s2 = (float)(xPos + xStep*veci.s2);\n"
" x0.s3 = (float)(xPos + xStep*veci.s3);\n"
" float4 y0;\n"
" y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
" y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
" y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
" y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
"\n"
" float4 x = x0;\n"
" float4 y = y0;\n"
"\n"
" uint iter = 0;\n"
" float4 tmp;\n"
" int4 stay;\n"
" int4 ccount = 0;\n"
" float4 savx = x;\n"
" float4 savy = y;\n"
" stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
" for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
"maxIter); iter+=16)\n"
" {\n"
" x = savx;\n"
" y = savy;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
" savx = (stay ? x : savx);\n"
" savy = (stay ? y : savy);\n"
" ccount -= stay*16;\n"
" }\n"
" // Handle remainder\n"
" if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
" {\n"
" iter = 16;\n"
" do\n"
" {\n"
" x = savx;\n"
" y = savy;\n"
" // More efficient to use scalar ops here: Why?\n"
" stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
"maxIter);\n"
" stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
"maxIter);\n"
" stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
"maxIter);\n"
" stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
"maxIter);\n"
" tmp = x;\n"
" x = x*x + x0 - y*y;\n"
" y = 2.0f*tmp*y + y0;\n"
" ccount += stay;\n"
" iter--;\n"
" savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
" savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
" savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
" savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
" savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
" savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
" savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
" savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
" } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
" }\n"
" __global uint4 *vecOut = (__global uint4 *)out;\n"
" vecOut[tid] = convert_uint4(ccount);\n"
"}\n";
// Registers ten sub-tests for every entry of the coords table.
OCLPerfConcurrency::OCLPerfConcurrency() {
  const unsigned int subTestsPerCoord = 10;
  _numSubTests = subTestsPerCoord * numCoords;
}

// The destructor body is intentionally empty.
OCLPerfConcurrency::~OCLPerfConcurrency() {}
// Fills the first width_ uints of `buffer` with `val`.
//
// The buffer is mapped for writing through cmd_queue_[0] (blocking), filled on
// the host, unmapped, and the queue is drained before returning. A failed map
// is reported through CHECK_RESULT instead of dereferencing a null pointer.
void OCLPerfConcurrency::setData(cl_mem buffer, unsigned int val) {
  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
      cmd_queue_[0], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
      &error_);
  CHECK_RESULT(((error_ != CL_SUCCESS) || (data == NULL)),
               "clEnqueueMapBuffer failed");
  for (unsigned int i = 0; i < width_; i++) data[i] = val;
  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[0], buffer, data, 0,
                                             NULL, NULL);
  _wrapper->clFinish(cmd_queue_[0]);
}
// Sums the first width_ uints of `buffer` into totalIters.
//
// totalIters is reset before the buffer is mapped, so a failed map leaves it
// at zero rather than at a stale value from a previous call. The map goes
// through cmd_queue_[0] (blocking, read-only); the queue is drained after the
// unmap.
void OCLPerfConcurrency::checkData(cl_mem buffer) {
  totalIters = 0;
  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
      cmd_queue_[0], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
      &error_);
  CHECK_RESULT(((error_ != CL_SUCCESS) || (data == NULL)),
               "clEnqueueMapBuffer failed");
  for (unsigned int i = 0; i < width_; i++) {
    totalIters += data[i];
  }
  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[0], buffer, data, 0,
                                             NULL, NULL);
  _wrapper->clFinish(cmd_queue_[0]);
}
// Context notification callback handed to clCreateContext; it deliberately
// ignores every argument.
static void CL_CALLBACK notify_callback(const char *errinfo,
                                        const void *private_info, size_t cb,
                                        void *user_data) {
  (void)errinfo;
  (void)private_info;
  (void)cb;
  (void)user_data;
}
void OCLPerfConcurrency::open(unsigned int test, char *units,
double &conversion, unsigned int deviceId) {
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_uint num_devices = 0;
cl_device_id *devices = NULL;
cl_device_id device = NULL;
unsigned int i;
if (type_ != CL_DEVICE_TYPE_GPU) {
char msg[256];
SNPRINTF(msg, sizeof(msg), "No GPU devices present. Exiting!\t");
testDescString = msg;
return;
}
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
context_ = 0;
for (i = 0; i < MAX_ASYNC_QUEUES; i++) {
cmd_queue_[i] = 0;
program_[i] = 0;
kernel_[i] = 0;
outBuffer_[i] = 0;
}
// Maximum iteration count
// NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible
// by 16 NOTE: Can increase to get better peak performance numbers, but be
// sure not to TDR slow ASICs! NOTE:. for warmup run we use maxIter = 256 and
// then for the actual run we use maxIter = 8388608 * (engine_clock / 1000).
maxIter = 256;
// NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
// processes 4 pixels at once NOTE: Can increase to get better peak
// performance numbers, but be sure not to TDR slow ASICs!
width_ = 256;
// We compute a square domain
bufSize_ = width_ * sizeof(cl_uint);
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
#if 0
// Get last for default
platform = platforms[numPlatforms-1];
for (i = 0; i < numPlatforms; ++i) {
#endif
platform = platforms[_platformIndex];
char pbuf[100];
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
NULL);
num_devices = 0;
/* Get the number of requested devices */
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
&num_devices);
// Runtime returns an error when no GPU devices are present instead of just
// returning 0 devices
// CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
// Choose platform with GPU devices
// if (num_devices > 0)
//{
// platform = platforms[_platformIndex];
// break;
//}
#if 0
}
#endif
delete platforms;
}
/*
* If we could find our platform, use it. If not, die as we need the AMD
* platform for these extensions.
*/
CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
CHECK_RESULT(devices == 0, "no devices");
/* Get the requested device */
error_ =
_wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");
device = devices[_deviceId];
context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
&error_);
CHECK_RESULT(context_ == 0, "clCreateContext failed");
char charbuf[1024];
size_t retsize;
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024,
charbuf, &retsize);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
cl_uint numAsyncQueues;
error_ = _wrapper->clGetDeviceInfo(
device, CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD, sizeof(numAsyncQueues),
&numAsyncQueues, &retsize);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
CHECK_RESULT(numAsyncQueues > MAX_ASYNC_QUEUES,
"numAsyncQueues is too large for this test");
error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(size_t), &numCUs, &retsize);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
switch (_openTest) {
case 0:
num_cmd_queues = num_programs = num_kernels = num_outbuffers = 1;
break;
case 1:
num_cmd_queues = 1;
num_programs = 1;
num_kernels = 1;
num_outbuffers = 2;
break;
case 2:
num_cmd_queues = 1;
num_programs = 2;
num_kernels = 2;
num_outbuffers = 2;
break;
case 3:
num_cmd_queues = num_programs = num_kernels = num_outbuffers = 2;
break;
case 4:
case 5:
case 6:
case 7:
case 8:
case 9:
num_cmd_queues = num_programs = num_kernels = num_outbuffers =
numAsyncQueues % 8;
break;
default:
break;
}
for (i = 0; i < num_cmd_queues; i++) {
cmd_queue_[i] = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
}
for (i = 0; i < num_outbuffers; i++) {
outBuffer_[i] =
_wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
}
const char *tmp;
tmp = float_mandel_vec;
for (i = 0; i < num_programs; i++) {
program_[i] = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&tmp, NULL, &error_);
CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
error_ = _wrapper->clBuildProgram(program_[i], 1, &device, "", NULL, NULL);
if (error_ != CL_SUCCESS) {
cl_int intError;
char log[16384];
intError = _wrapper->clGetProgramBuildInfo(
program_[i], device, CL_PROGRAM_BUILD_LOG, 16384 * sizeof(char), log,
NULL);
printf("Build error -> %s\n", log);
CHECK_RESULT(0, "clBuildProgram failed");
}
}
for (i = 0; i < num_kernels; i++) {
kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
}
coordIdx = _openTest % numCoords;
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
for (i = 0; i < num_kernels; i++) {
error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem),
(void *)&outBuffer_[i]);
error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint),
(void *)&width_);
error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float),
(void *)&xPos);
error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float),
(void *)&yPos);
error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float),
(void *)&xStep);
error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float),
(void *)&yStep);
error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
(void *)&maxIter);
}
for (i = 0; i < num_outbuffers; i++) {
setData(outBuffer_[i], 0xdeadbeef);
}
unsigned int clkFrequency = 0;
error_ = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
sizeof(clkFrequency), &clkFrequency, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
assert(clkFrequency > 0);
maxIter =
(unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
maxIter = (maxIter + 15) & ~15;
}
void OCLPerfConcurrency::run(void) {
// Test runs only on GPU
if (type_ != CL_DEVICE_TYPE_GPU) return;
int global = width_ >> 2;
// We handle 4 pixels per thread
int local = 64;
size_t global_work_size[1] = {(size_t)global};
size_t local_work_size[1] = {(size_t)local};
unsigned int i;
// Warmup
for (i = 0; i < num_kernels; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
(const size_t *)global_work_size, (const size_t *)local_work_size, 0,
NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
}
for (i = 0; i < num_cmd_queues; i++) {
_wrapper->clFlush(cmd_queue_[i]);
}
for (i = 0; i < num_cmd_queues; i++) {
_wrapper->clFinish(cmd_queue_[i]);
}
for (i = 0; i < num_kernels; i++) {
error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
(void *)&maxIter);
}
CPerfCounter timer;
timer.Reset();
timer.Start();
for (i = 0; i < num_kernels; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_[i % num_cmd_queues], kernel_[i], 1, NULL,
(const size_t *)global_work_size, (const size_t *)local_work_size, 0,
NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
}
if (_openTest == 1) {
error_ = _wrapper->clSetKernelArg(kernel_[0], 0, sizeof(cl_mem),
(void *)&outBuffer_[1]);
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_[0], kernel_[0], 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
}
for (i = 0; i < num_cmd_queues; i++) {
_wrapper->clFlush(cmd_queue_[i]);
}
for (i = 0; i < num_cmd_queues; i++) {
_wrapper->clFinish(cmd_queue_[i]);
}
timer.Stop();
double sec = timer.GetElapsedTime();
unsigned long long expected =
(unsigned long long)width_ * (unsigned long long)maxIter;
for (i = 0; i < num_outbuffers; i++) {
checkData(outBuffer_[i]);
CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
}
_perfInfo = (float)sec;
if (_openTest == 0)
testDescString = "time for 1 kernel (s) ";
else if (_openTest == 1)
testDescString = "time for 2 kernels (s) (same kernel) ";
else if (_openTest == 2)
testDescString = "time for 2 kernels (s) (diff kernels)";
else {
char buf[128];
SNPRINTF(buf, sizeof(buf), "time for %d kernels (s) ( %d queues) ",
num_kernels, num_cmd_queues);
testDescString = buf;
}
}
// Drains the first queue, then releases the buffers, kernels, programs,
// queues and finally the context created by open(). Release failures are
// recorded without aborting the teardown. Returns _crcword (0 when the test
// is not running on a GPU device type).
unsigned int OCLPerfConcurrency::close(void) {
  // Test runs only on GPU
  if (type_ != CL_DEVICE_TYPE_GPU) {
    return 0;
  }

  _wrapper->clFinish(cmd_queue_[0]);

  for (unsigned int b = 0; b < num_outbuffers; ++b) {
    error_ = _wrapper->clReleaseMemObject(outBuffer_[b]);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(outBuffer_) failed");
  }

  for (unsigned int k = 0; k < num_kernels; ++k) {
    error_ = _wrapper->clReleaseKernel(kernel_[k]);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseKernel(kernel_) failed");
  }

  for (unsigned int p = 0; p < num_programs; ++p) {
    error_ = _wrapper->clReleaseProgram(program_[p]);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseProgram(program_) failed");
  }

  for (unsigned int q = 0; q < num_cmd_queues; ++q) {
    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[q]);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseCommandQueue failed");
  }

  if (context_ != 0) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
  }

  return _crcword;
}
@@ -0,0 +1,63 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_Perf_Concurrency_H_
#define _OCL_Perf_Concurrency_H_
#include "OCLTestImp.h"
// Declaration of the OCLPerfConcurrency test case, derived from OCLTestImp.
class OCLPerfConcurrency : public OCLTestImp {
public:
OCLPerfConcurrency();
virtual ~OCLPerfConcurrency();
public:
// Test life-cycle entry points (open / run / close).
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
virtual void run(void);
virtual unsigned int close(void);
std::string shader_;
// Helpers that fill / read back an output buffer.
void setData(cl_mem buffer, unsigned int data);
void checkData(cl_mem buffer);
// Capacity of the per-queue handle arrays below.
#define MAX_ASYNC_QUEUES 8
// OpenCL handles; only the first num_* entries of each array are in use.
cl_context context_;
cl_command_queue cmd_queue_[MAX_ASYNC_QUEUES];
cl_program program_[MAX_ASYNC_QUEUES];
cl_kernel kernel_[MAX_ASYNC_QUEUES];
cl_mem outBuffer_[MAX_ASYNC_QUEUES];
// Most recent OpenCL status code.
cl_int error_;
// Number of valid entries in the corresponding arrays above.
unsigned int num_cmd_queues;
unsigned int num_programs;
unsigned int num_kernels;
unsigned int num_outbuffers;
// Number of output elements (uints) per buffer.
unsigned int width_;
// Size of one output buffer in bytes.
unsigned int bufSize_;
// Iteration limit passed to the kernel.
unsigned int maxIter;
// Index into the coordinate table selected for the current sub-test.
unsigned int coordIdx;
// Sum of the values read back by checkData().
unsigned long long totalIters;
// Compute-unit count of the device (CL_DEVICE_MAX_COMPUTE_UNITS).
size_t numCUs;
};
#endif // _OCL_Perf_Concurrency_H_
@@ -0,0 +1,243 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDevMemReadSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/opencl.h"
#include "Timer.h"
// Bounded-printf alias: sprintf_s when WIN_OS is defined, snprintf otherwise
// (keeps the compiler quiet about the plain variant).
#if !defined(WIN_OS)
#define SNPRINTF snprintf
#else
#define SNPRINTF sprintf_s
#endif
// Source-buffer sizes in bytes; currently a single 256 MiB entry.
#define NUM_SIZES 1
static const unsigned int Sizes[NUM_SIZES] = {
    1u << 28,  // 256 * 1024 * 1024
};
// OpenCL C source of "read_kernel".
//
// Each work-item starts at element get_global_id(0) of src and walks the
// buffer in steps of `threads` uint16 elements until it passes src + size1,
// summing all 16 components of every element it reads. The per-work-item sum
// is then added atomically to *dst.
// The string is passed verbatim to the OpenCL compiler: do not reformat it.
const static char *strKernel =
"__kernel void read_kernel(__global uint16 *src, ulong size1, uint "
"threads, __global uint* dst\n"
" )\n"
"{\n"
" uint16 pval;\n"
" int idx = get_global_id(0);\n"
" __global uint16 *srcEnd = src + size1;\n"
" uint tmp = 0;\n"
" src = &src[idx];"
" while (src < srcEnd) \n"
" {\n"
" pval = *src;\n"
" src += threads;\n"
" tmp += pval.s0 + pval.s1 + pval.s2 + pval.s3 + pval.s4 + pval.s5 + pval.s6 + \
pval.s7 + pval.s8 + pval.s9 + pval.sa + pval.sb + pval.sc + pval.sd + pval.se + pval.sf;\n"
" }\n"
" atomic_add(dst, tmp);\n"
"}\n";
// This test exposes exactly one sub-test.
OCLPerfDevMemReadSpeed::OCLPerfDevMemReadSpeed() {
  _numSubTests = 1;
}

// The destructor body is intentionally empty.
OCLPerfDevMemReadSpeed::~OCLPerfDevMemReadSpeed() {
}
// No-op callback: every argument is ignored.
static void CL_CALLBACK notify_callback(const char *errinfo,
                                        const void *private_info, size_t cb,
                                        void *user_data) {
  (void)errinfo;
  (void)private_info;
  (void)cb;
  (void)user_data;
}
// Prepares the device-memory read benchmark.
//
// Builds read_kernel, creates a source buffer of Sizes[0] bytes filled with
// inputData, creates a one-uint destination buffer cleared to zero, and binds
// the four kernel arguments. The launch geometry is
// maxComputeUnits * 8 work-groups of 64 work-items.
//
// Parameters are forwarded unchanged to OCLTestImp::open().
void OCLPerfDevMemReadSpeed::open(unsigned int test, char *units,
                                  double &conversion, unsigned int deviceId) {
  // Put every member that close() inspects into a known state before anything
  // below can fail and return early.
  skip_ = false;
  srcBuffer_ = 0;
  dstBuffer_ = 0;
  error_ = CL_SUCCESS;
  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
  program_ = 0;
  kernel_ = 0;
  nBytes = Sizes[0];
  // Number of uint16 elements in the source buffer.
  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));
  cl_uint maxCUs;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
                                     CL_DEVICE_MAX_COMPUTE_UNITS,
                                     sizeof(cl_uint), &maxCUs, 0);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  wgs = 64;
  const static cl_uint wavesPerCU = 8;
  nWorkItems = maxCUs * wavesPerCU * wgs;
  inputData = 0x1;
  nIter = 1000;
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Start with an empty log in case the query itself fails.
    char programLog[1024] = {0};
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
    printf("\n%s\n", programLog);
    fflush(stdout);
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");
  kernel_ = _wrapper->clCreateKernel(program_, "read_kernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  // Source buffer: fill every uint with inputData.
  srcBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_READ_ONLY, nBytes,
                                        NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(srcBuffer) failed");
  void *mem;
  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], srcBuffer_, CL_TRUE,
                                     CL_MAP_READ | CL_MAP_WRITE, 0, nBytes, 0,
                                     NULL, NULL, &error_);
  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
  for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); ++i) {
    reinterpret_cast<cl_uint *>(mem)[i] = inputData;
  }
  // Unmap straight away so the buffer is not left mapped if a later step
  // fails and returns early.
  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], srcBuffer_, mem, 0,
                                    NULL, NULL);

  // Destination buffer: a single uint accumulator, cleared to zero.
  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY,
                                        sizeof(cl_uint), NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");
  mem = _wrapper->clEnqueueMapBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_TRUE,
                                     CL_MAP_READ | CL_MAP_WRITE, 0,
                                     sizeof(cl_uint), 0, NULL, NULL, &error_);
  CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
  memset(mem, 0, sizeof(cl_uint));
  _wrapper->clEnqueueUnmapMemObject(cmdQueues_[_deviceId], dstBuffer_, mem, 0,
                                    NULL, NULL);

  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &srcBuffer_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
  error_ =
      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
                                    (void *)&nWorkItems);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
  error_ =
      _wrapper->clSetKernelArg(kernel_, 3, sizeof(cl_mem), (void *)&dstBuffer_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
}
void OCLPerfDevMemReadSpeed::run(void) {
if (skip_) {
return;
}
CPerfCounter timer;
size_t gws[1] = {nWorkItems};
size_t lws[1] = {wgs};
// warm up
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
cl_uint *memResult;
memResult = (cl_uint *)malloc(sizeof(cl_uint));
if (0 == memResult) {
CHECK_RESULT_NO_RETURN(0, "malloc failed!\n");
return;
}
memset(memResult, 0, sizeof(cl_uint));
error_ = _wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_,
CL_FALSE, 0, sizeof(cl_uint),
memResult, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
_wrapper->clFinish(cmdQueues_[_deviceId]);
if (memResult[0] != (nBytes / sizeof(cl_uint))) {
CHECK_RESULT_NO_RETURN(0, "Data validation failed for warm up run!\n");
free(memResult);
return;
}
free(memResult);
timer.Reset();
timer.Start();
double sec2 = 0;
cl_event *events = new cl_event[nIter];
for (unsigned int i = 0; i < nIter; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmdQueues_[_deviceId], kernel_, 1, NULL, gws, lws, 0, NULL, &events[i]);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
}
_wrapper->clFinish(cmdQueues_[_deviceId]);
timer.Stop();
for (unsigned int i = 0; i < nIter; i++) {
cl_ulong startTime = 0, endTime = 0;
error_ = _wrapper->clGetEventProfilingInfo(
events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
error_ = _wrapper->clGetEventProfilingInfo(
events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
_wrapper->clReleaseEvent(events[i]);
sec2 += endTime - startTime;
}
double sec = timer.GetElapsedTime();
delete[] events;
// read speed in GB/s
double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
double perf2 = ((double)nBytes * nIter) / sec2;
_perfInfo = (float)perf2;
float perfInfo = (float)perf;
char buf[256];
SNPRINTF(buf, sizeof(buf), " (%8d bytes) i:%4d Wall time Perf: %.2f (GB/s)",
nBytes, nIter, perfInfo);
testDescString = buf;
}
// Releases the source and destination buffers (unless the test was skipped)
// and then delegates the rest of the teardown to the base class, returning
// its result. Release failures are recorded without aborting the teardown.
unsigned int OCLPerfDevMemReadSpeed::close(void) {
  if (!skip_) {
    if (srcBuffer_) {
      error_ = _wrapper->clReleaseMemObject(srcBuffer_);
      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                             "clReleaseMemObject(srcBuffer_) failed");
    }
    if (dstBuffer_) {
      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
      // Name the buffer that actually failed (was a copy of the srcBuffer_
      // message).
      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                             "clReleaseMemObject(dstBuffer_) failed");
    }
  }
  return OCLTestImp::close();
}
@@ -0,0 +1,47 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_DevMemReadSpeed_H_
#define _OCL_DevMemReadSpeed_H_
#include "OCLTestImp.h"
// Performance test derived from OCLTestImp; its run() reports a device-memory
// read speed in GB/s.
class OCLPerfDevMemReadSpeed : public OCLTestImp {
public:
OCLPerfDevMemReadSpeed();
virtual ~OCLPerfDevMemReadSpeed();
public:
// Test entry points.
// NOTE(review): presumably invoked by the harness in open/run/close order —
// confirm against OCLTestImp.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
virtual void run(void);
// Releases srcBuffer_ and dstBuffer_ (unless skip_ is set) and returns the
// result of OCLTestImp::close().
virtual unsigned int close(void);
// Device buffers released in close().
cl_mem srcBuffer_;
cl_mem dstBuffer_;
unsigned int nWorkItems; // number of GPU work items
unsigned int wgs; // work group size
unsigned int nBytes; // input and output buffer size
unsigned int nIter; // overall number of timing loops
cl_uint inputData; // input data to fill the input buffer
// When true, close() leaves srcBuffer_/dstBuffer_ untouched.
bool skip_;
};
#endif // _OCL_DevMemReadSpeed_H_
@@ -0,0 +1,212 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDevMemWriteSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/opencl.h"
#include "Timer.h"
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Number of entries in Sizes[].
#define NUM_SIZES 1
// Buffer sizes in bytes; open() uses Sizes[0] (256 MiB) as nBytes.
static const unsigned int Sizes[NUM_SIZES] = {256 * 1024 * 1024};
// OpenCL C source of the write kernel. Each work-item starts at its global id
// and stores a uint16 vector filled with 0xabababab, then advances by
// `threads` elements until it reaches dst + size1. Because it is a do/while
// loop, every work-item performs at least one store.
const static char *strKernel =
"__kernel void write_kernel(__global uint16 *dst, ulong size1, uint "
"threads\n"
" )\n"
"{\n"
" uint16 pval = (uint16)(0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab,\
0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab, 0xabababab);\n"
" int idx = get_global_id(0);\n"
" __global uint16 *dstEnd = dst + size1;\n"
" dst = &dst[idx];"
" do\n"
" {\n"
" *dst = pval;\n"
" dst += threads;\n"
" }\n"
" while (dst < dstEnd);\n"
"}\n";
// Constructor: this test exposes a single sub-test.
OCLPerfDevMemWriteSpeed::OCLPerfDevMemWriteSpeed() { _numSubTests = 1; }
// Destructor: intentionally empty; dstBuffer_ is released in close().
OCLPerfDevMemWriteSpeed::~OCLPerfDevMemWriteSpeed() {}
// Error-notification callback with an empty body: all arguments are ignored.
// NOTE(review): nothing in this translation unit references it — it looks
// unused here; confirm before removing.
static void CL_CALLBACK notify_callback(const char *errinfo,
const void *private_info, size_t cb,
void *user_data) {}
/**
 * Prepares the device-memory write-speed test.
 *
 * Runs the base-class open, derives the launch size from the device's compute
 * unit count (maxCUs * 8 waves * 64 work-items), builds the write kernel,
 * creates the write-only destination buffer of nBytes and binds the three
 * kernel arguments (buffer, element count, total work-item count).
 *
 * @param test        sub-test index, forwarded to OCLTestImp::open().
 * @param units       forwarded to OCLTestImp::open().
 * @param conversion  forwarded to OCLTestImp::open().
 * @param deviceId    device index, forwarded to OCLTestImp::open() and used to
 *                    select the build target.
 */
void OCLPerfDevMemWriteSpeed::open(unsigned int test, char *units,
                                   double &conversion, unsigned int deviceId) {
  error_ = CL_SUCCESS;
  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");

  program_ = 0;
  kernel_ = 0;
  skip_ = false;
  dstBuffer_ = 0;

  nBytes = Sizes[0];
  // The kernel addresses the buffer as uint16 vectors (16 cl_uint each), so
  // the element count passed to it is expressed in those units.
  cl_ulong loopCnt = nBytes / (16 * sizeof(cl_uint));

  cl_uint maxCUs = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
                                     CL_DEVICE_MAX_COMPUTE_UNITS,
                                     sizeof(cl_uint), &maxCUs, 0);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");

  wgs = 64;
  const static cl_uint wavesPerCU = 8;
  nWorkItems = maxCUs * wavesPerCU * wgs;
  // Pattern the kernel writes; run() validates the buffer against it.
  inputData = 0xabababab;
  nIter = 1000;

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Ask for the log size first: with a fixed-size buffer the query fails
    // when the log does not fit and the buffer would be printed
    // uninitialized.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char *programLog = new char[logSize + 1];
    programLog[0] = '\0';
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
                                    0);
    programLog[logSize] = '\0';
    printf("\n%s\n", programLog);
    fflush(stdout);
    delete[] programLog;
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "write_kernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  dstBuffer_ = _wrapper->clCreateBuffer(context_, CL_MEM_WRITE_ONLY, nBytes,
                                        NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer(dstBuffer) failed");

  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &dstBuffer_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
  error_ =
      _wrapper->clSetKernelArg(kernel_, 1, sizeof(cl_ulong), (void *)&loopCnt);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
  error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(cl_uint),
                                    (void *)&nWorkItems);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
}
void OCLPerfDevMemWriteSpeed::run(void) {
if (skip_) {
return;
}
CPerfCounter timer;
size_t gws[1] = {nWorkItems};
size_t lws[1] = {wgs};
// warm up
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
cl_uint *memResult;
memResult = (cl_uint *)malloc(nBytes);
if (0 == memResult) {
CHECK_RESULT_NO_RETURN(0, "malloc failed!\n");
return;
}
memset(memResult, 0, nBytes);
error_ =
_wrapper->clEnqueueReadBuffer(cmdQueues_[_deviceId], dstBuffer_, CL_FALSE,
0, nBytes, memResult, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueReadBuffer dstBuffer_ failed!");
_wrapper->clFinish(cmdQueues_[_deviceId]);
for (unsigned int i = 0; i < nBytes / sizeof(cl_uint); i++) {
if (((cl_uint *)memResult)[i] != inputData) {
CHECK_RESULT_NO_RETURN(0, "Data validation failed for warm up run!\n");
free(memResult);
return;
}
}
free(memResult);
timer.Reset();
timer.Start();
double sec2 = 0;
cl_event *events = new cl_event[nIter];
for (unsigned int i = 0; i < nIter; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmdQueues_[_deviceId], kernel_, 1, NULL, gws, lws, 0, NULL, &events[i]);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
}
_wrapper->clFinish(cmdQueues_[_deviceId]);
timer.Stop();
for (unsigned int i = 0; i < nIter; i++) {
cl_ulong startTime = 0, endTime = 0;
error_ = _wrapper->clGetEventProfilingInfo(
events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
error_ = _wrapper->clGetEventProfilingInfo(
events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
CHECK_RESULT(error_, "clGetEventProfilingInfo failed");
_wrapper->clReleaseEvent(events[i]);
sec2 += endTime - startTime;
}
double sec = timer.GetElapsedTime();
delete[] events;
// write speed in GB/s
double perf = ((double)nBytes * nIter * (double)(1e-09)) / sec;
double perf2 = ((double)nBytes * nIter) / sec2;
_perfInfo = (float)perf2;
float perfInfo = (float)perf;
char buf[256];
SNPRINTF(buf, sizeof(buf), " (%8d bytes) i:%4d Wall time Perf: %.2f (GB/s)",
nBytes, nIter, perfInfo);
testDescString = buf;
}
/**
 * Releases the destination buffer (unless the test was skipped) and then
 * delegates to OCLTestImp::close().
 *
 * @return the value returned by OCLTestImp::close().
 */
unsigned int OCLPerfDevMemWriteSpeed::close(void) {
  if (!skip_) {
    if (dstBuffer_) {
      error_ = _wrapper->clReleaseMemObject(dstBuffer_);
      // This test only owns dstBuffer_; the message used to name srcBuffer_.
      CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                             "clReleaseMemObject(dstBuffer_) failed");
    }
  }
  return OCLTestImp::close();
}
@@ -0,0 +1,46 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_DevMemWriteSpeed_H_
#define _OCL_DevMemWriteSpeed_H_
#include "OCLTestImp.h"
// Performance test derived from OCLTestImp that measures how fast a kernel
// can write a device buffer (reported in GB/s by run()).
class OCLPerfDevMemWriteSpeed : public OCLTestImp {
public:
OCLPerfDevMemWriteSpeed();
virtual ~OCLPerfDevMemWriteSpeed();
public:
// Builds the write kernel, creates dstBuffer_ and binds the kernel arguments.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Validates a warm-up launch, then times nIter kernel launches.
virtual void run(void);
// Releases dstBuffer_ (unless skip_ is set) and returns OCLTestImp::close().
virtual unsigned int close(void);
// Write-only destination buffer of nBytes, created in open().
cl_mem dstBuffer_;
unsigned int nWorkItems; // number of GPU work items
unsigned int wgs; // work group size
unsigned int nBytes; // output buffer size
unsigned int nIter; // overall number of timing loops
cl_uint inputData; // pattern the kernel is expected to write; run() validates the buffer against it
// When true, run() returns immediately and close() skips the buffer release.
bool skip_;
};
#endif // _OCL_DevMemWriteSpeed_H_
@@ -0,0 +1,480 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDeviceConcurrency.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
#include "Timer.h"
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Region of the complex plane to render: centre (x, y) and edge length width.
typedef struct {
double x;
double y;
double width;
} coordRec;
// Regions available to the test; open() selects one with
// _openTest % numCoords.
static coordRec coords[] = {
{0.0, 0.0, 0.00001}, // All black
};
// Number of entries in coords[].
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
// OpenCL C source of the float4 Mandelbrot kernel. Each work-item computes
// four horizontally adjacent pixels: the main loop advances 16 iterations per
// pass while any lane is still inside the |z|^2 <= 4 radius and iter < maxIter,
// a scalar remainder loop then finishes lanes that escaped mid-pass, and the
// four per-pixel iteration counts are stored as one uint4 at out[tid].
static const char *float_mandel_vec =
"__kernel void mandelbrot(__global uint *out, uint width, float xPos, "
"float yPos, float xStep, float yStep, uint maxIter)\n"
"{\n"
" int tid = get_global_id(0);\n"
" int i = tid % (width/4);\n"
" int j = tid / (width/4);\n"
" int4 veci = (int4)(4*i, 4*i+1, 4*i+2, 4*i+3);\n"
" int4 vecj = (int4)(j, j, j, j);\n"
" float4 x0;\n"
" x0.s0 = (float)(xPos + xStep*veci.s0);\n"
" x0.s1 = (float)(xPos + xStep*veci.s1);\n"
" x0.s2 = (float)(xPos + xStep*veci.s2);\n"
" x0.s3 = (float)(xPos + xStep*veci.s3);\n"
" float4 y0;\n"
" y0.s0 = (float)(yPos + yStep*vecj.s0);\n"
" y0.s1 = (float)(yPos + yStep*vecj.s1);\n"
" y0.s2 = (float)(yPos + yStep*vecj.s2);\n"
" y0.s3 = (float)(yPos + yStep*vecj.s3);\n"
"\n"
" float4 x = x0;\n"
" float4 y = y0;\n"
"\n"
" uint iter = 0;\n"
" float4 tmp;\n"
" int4 stay;\n"
" int4 ccount = 0;\n"
" float4 savx = x;\n"
" float4 savy = y;\n"
" stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
" for (iter = 0; (stay.s0 | stay.s1 | stay.s2 | stay.s3) && (iter < "
"maxIter); iter+=16)\n"
" {\n"
" x = savx;\n"
" y = savy;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" // Two iterations\n"
" tmp = x*x + x0 - y*y;\n"
" y = 2.0f * x * y + y0;\n"
" x = tmp*tmp + x0 - y*y;\n"
" y = 2.0f * tmp * y + y0;\n"
"\n"
" stay = (x*x+y*y) <= (float4)(4.0f, 4.0f, 4.0f, 4.0f);\n"
" savx = (stay ? x : savx);\n"
" savy = (stay ? y : savy);\n"
" ccount -= stay*16;\n"
" }\n"
" // Handle remainder\n"
" if (!(stay.s0 & stay.s1 & stay.s2 & stay.s3))\n"
" {\n"
" iter = 16;\n"
" do\n"
" {\n"
" x = savx;\n"
" y = savy;\n"
" // More efficient to use scalar ops here: Why?\n"
" stay.s0 = ((x.s0*x.s0+y.s0*y.s0) <= 4.0f) && (ccount.s0 < "
"maxIter);\n"
" stay.s1 = ((x.s1*x.s1+y.s1*y.s1) <= 4.0f) && (ccount.s1 < "
"maxIter);\n"
" stay.s2 = ((x.s2*x.s2+y.s2*y.s2) <= 4.0f) && (ccount.s2 < "
"maxIter);\n"
" stay.s3 = ((x.s3*x.s3+y.s3*y.s3) <= 4.0f) && (ccount.s3 < "
"maxIter);\n"
" tmp = x;\n"
" x = x*x + x0 - y*y;\n"
" y = 2.0f*tmp*y + y0;\n"
" ccount += stay;\n"
" iter--;\n"
" savx.s0 = (stay.s0 ? x.s0 : savx.s0);\n"
" savx.s1 = (stay.s1 ? x.s1 : savx.s1);\n"
" savx.s2 = (stay.s2 ? x.s2 : savx.s2);\n"
" savx.s3 = (stay.s3 ? x.s3 : savx.s3);\n"
" savy.s0 = (stay.s0 ? y.s0 : savy.s0);\n"
" savy.s1 = (stay.s1 ? y.s1 : savy.s1);\n"
" savy.s2 = (stay.s2 ? y.s2 : savy.s2);\n"
" savy.s3 = (stay.s3 ? y.s3 : savy.s3);\n"
" } while ((stay.s0 | stay.s1 | stay.s2 | stay.s3) && iter);\n"
" }\n"
" __global uint4 *vecOut = (__global uint4 *)out;\n"
" vecOut[tid] = convert_uint4(ccount);\n"
"}\n";
/**
 * Counts the devices of type type_ on the platform selected by _platformIndex
 * (clamped to MAX_DEVICES) and exposes one sub-test per device.
 */
OCLPerfDeviceConcurrency::OCLPerfDeviceConcurrency() {
  cl_uint numPlatforms = 0;
  // Start from "no devices" so _numSubTests is well defined even when no
  // platform is reported.
  num_devices = 0;

  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");

  if (0 < numPlatforms) {
    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
    const cl_int status =
        _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
    error_ = status;
    if (status == CL_SUCCESS) {
      /* Get the number of requested devices */
      error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0,
                                        NULL, &num_devices);
    }
    // Array form of delete, issued before the check below can return early,
    // so the platform list is never leaked.
    delete[] platforms;
    CHECK_RESULT(status != CL_SUCCESS, "clGetPlatformIDs failed");

    if (num_devices > MAX_DEVICES) {
      num_devices = MAX_DEVICES;
    }
  }
  _numSubTests = num_devices;
}
// Destructor: intentionally empty; OpenCL objects are released in close().
OCLPerfDeviceConcurrency::~OCLPerfDeviceConcurrency() {}
/**
 * Fills the first width_ cl_uint elements of a buffer with a constant.
 *
 * The buffer is mapped for writing with a blocking map on queue idx, filled on
 * the host, unmapped, and the queue is drained before returning.
 *
 * @param buffer  buffer of at least bufSize_ bytes.
 * @param idx     index into cmd_queue_ of the queue to use.
 * @param val     value stored into every element.
 */
void OCLPerfDeviceConcurrency::setData(cl_mem buffer, unsigned int idx,
                                       unsigned int val) {
  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
      cmd_queue_[idx], buffer, true, CL_MAP_WRITE, 0, bufSize_, 0, NULL, NULL,
      &error_);
  // A failed map yields no usable pointer; report it instead of writing
  // through it.
  CHECK_RESULT((data == NULL) || (error_ != CL_SUCCESS),
               "clEnqueueMapBuffer failed");

  for (unsigned int i = 0; i < width_; i++) {
    data[i] = val;
  }

  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[idx], buffer, data, 0,
                                             NULL, NULL);
  _wrapper->clFinish(cmd_queue_[idx]);
  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
}
/**
 * Sums the first width_ cl_uint elements of a buffer into totalIters.
 *
 * The buffer is mapped for reading with a blocking map on queue idx, summed on
 * the host, unmapped, and the queue is drained before returning. totalIters
 * is reset to 0 first, so it stays 0 when the map fails.
 *
 * @param buffer  buffer of at least bufSize_ bytes.
 * @param idx     index into cmd_queue_ of the queue to use.
 */
void OCLPerfDeviceConcurrency::checkData(cl_mem buffer, unsigned int idx) {
  totalIters = 0;

  unsigned int *data = (unsigned int *)_wrapper->clEnqueueMapBuffer(
      cmd_queue_[idx], buffer, true, CL_MAP_READ, 0, bufSize_, 0, NULL, NULL,
      &error_);
  // A failed map yields no usable pointer; report it instead of reading
  // through it.
  CHECK_RESULT((data == NULL) || (error_ != CL_SUCCESS),
               "clEnqueueMapBuffer failed");

  for (unsigned int i = 0; i < width_; i++) {
    totalIters += data[i];
  }

  error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_[idx], buffer, data, 0,
                                             NULL, NULL);
  _wrapper->clFinish(cmd_queue_[idx]);
  CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject failed");
}
// Context error-notification callback handed to clCreateContext() in open().
// The body is empty: all arguments are ignored.
static void CL_CALLBACK notify_callback(const char *errinfo,
const void *private_info, size_t cb,
void *user_data) {}
void OCLPerfDeviceConcurrency::open(unsigned int test, char *units,
double &conversion, unsigned int deviceId) {
cl_uint numPlatforms;
cl_platform_id platform = NULL;
num_devices = 0;
cl_device_id *devices = NULL;
unsigned int i;
_crcword = 0;
conversion = 1.0f;
_deviceId = deviceId;
_openTest = test;
context_ = 0;
for (i = 0; i < MAX_DEVICES; i++) {
cmd_queue_[i] = 0;
program_[i] = 0;
kernel_[i] = 0;
outBuffer_[i] = 0;
}
// Maximum iteration count
// NOTE: Some kernels are unrolled 16 times, so make sure maxIter is divisible
// by 16 NOTE: Can increase to get better peak performance numbers, but be
// sure not to TDR slow ASICs! NOTE:. for warmup run we use maxIter = 256 and
// then for the actual run we use maxIter = 8388608 * (engine_clock / 1000).
maxIter = 256;
// NOTE: Width needs to be divisible by 4 because the float_mandel_vec kernel
// processes 4 pixels at once NOTE: Can increase to get better peak
// performance numbers, but be sure not to TDR slow ASICs!
width_ = 256;
// We compute a square domain
bufSize_ = width_ * sizeof(cl_uint);
error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
if (0 < numPlatforms) {
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
platform = platforms[_platformIndex];
char pbuf[100];
error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex],
CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf,
NULL);
num_devices = 0;
/* Get the number of requested devices */
error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL,
&num_devices);
if (num_devices > MAX_DEVICES) {
num_devices = MAX_DEVICES;
}
delete platforms;
}
/*
* If we could find our platform, use it. If not, die as we need the AMD
* platform for these extensions.
*/
CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");
devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
CHECK_RESULT(devices == 0, "no devices");
/* Get the requested devices */
error_ =
_wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
context_ = _wrapper->clCreateContext(NULL, num_devices, devices,
notify_callback, NULL, &error_);
CHECK_RESULT(context_ == 0, "clCreateContext failed");
cur_devices = _openTest + 1;
for (i = 0; i < cur_devices; i++) {
cmd_queue_[i] =
_wrapper->clCreateCommandQueue(context_, devices[i], 0, NULL);
CHECK_RESULT(cmd_queue_[i] == 0, "clCreateCommandQueue failed");
outBuffer_[i] =
_wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
CHECK_RESULT(outBuffer_[i] == 0, "clCreateBuffer(outBuffer) failed");
}
const char *tmp;
tmp = float_mandel_vec;
for (i = 0; i < cur_devices; i++) {
program_[i] = _wrapper->clCreateProgramWithSource(
context_, 1, (const char **)&tmp, NULL, &error_);
CHECK_RESULT(program_[i] == 0, "clCreateProgramWithSource failed");
error_ =
_wrapper->clBuildProgram(program_[i], 1, &devices[i], "", NULL, NULL);
if (error_ != CL_SUCCESS) {
cl_int intError;
char log[16384];
intError = _wrapper->clGetProgramBuildInfo(
program_[i], devices[i], CL_PROGRAM_BUILD_LOG, 16384 * sizeof(char),
log, NULL);
printf("Build error on device %d -> %s\n", i, log);
CHECK_RESULT(0, "clBuildProgram failed");
}
}
for (i = 0; i < cur_devices; i++) {
kernel_[i] = _wrapper->clCreateKernel(program_[i], "mandelbrot", &error_);
CHECK_RESULT(kernel_[i] == 0, "clCreateKernel failed");
}
coordIdx = _openTest % numCoords;
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clSetKernelArg(kernel_[i], 0, sizeof(cl_mem),
(void *)&outBuffer_[i]);
error_ = _wrapper->clSetKernelArg(kernel_[i], 1, sizeof(cl_uint),
(void *)&width_);
error_ = _wrapper->clSetKernelArg(kernel_[i], 2, sizeof(cl_float),
(void *)&xPos);
error_ = _wrapper->clSetKernelArg(kernel_[i], 3, sizeof(cl_float),
(void *)&yPos);
error_ = _wrapper->clSetKernelArg(kernel_[i], 4, sizeof(cl_float),
(void *)&xStep);
error_ = _wrapper->clSetKernelArg(kernel_[i], 5, sizeof(cl_float),
(void *)&yStep);
error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
(void *)&maxIter);
}
for (i = 0; i < cur_devices; i++) {
setData(outBuffer_[i], i, 0xdeadbeef);
}
cl_uint clkFrequency = 0;
error_ = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CLOCK_FREQUENCY,
sizeof(clkFrequency), &clkFrequency, NULL);
CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
assert(clkFrequency > 0);
maxIter = (unsigned int)(8388608 * ((float)clkFrequency / 1000));
maxIter = (maxIter + 15) & ~15;
}
void OCLPerfDeviceConcurrency::run(void) {
int global = width_ >> 2;
// We handle 4 pixels per thread
int local = 64;
size_t global_work_size[1] = {(size_t)global};
size_t local_work_size[1] = {(size_t)local};
unsigned int i;
// Warmup
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_[i], kernel_[i], 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
}
for (i = 0; i < cur_devices; i++) {
_wrapper->clFlush(cmd_queue_[i]);
}
for (i = 0; i < cur_devices; i++) {
_wrapper->clFinish(cmd_queue_[i]);
}
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clSetKernelArg(kernel_[i], 6, sizeof(cl_uint),
(void *)&maxIter);
}
CPerfCounter timer;
timer.Reset();
timer.Start();
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_[i], kernel_[i], 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
}
for (i = 0; i < cur_devices; i++) {
_wrapper->clFlush(cmd_queue_[i]);
}
for (i = 0; i < cur_devices; i++) {
_wrapper->clFinish(cmd_queue_[i]);
}
timer.Stop();
double sec = timer.GetElapsedTime();
unsigned long long expected =
(unsigned long long)width_ * (unsigned long long)maxIter;
for (i = 0; i < cur_devices; i++) {
checkData(outBuffer_[i], i);
CHECK_RESULT(totalIters != expected, "Incorrect iteration count detected!");
}
_perfInfo = (float)sec;
char buf[128];
SNPRINTF(buf, sizeof(buf), "time for %2d devices (s) (%2d queues) ",
cur_devices, cur_devices);
testDescString = buf;
}
unsigned int OCLPerfDeviceConcurrency::close(void) {
unsigned int i;
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clReleaseMemObject(outBuffer_[i]);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseMemObject(outBuffer_) failed");
}
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clReleaseKernel(kernel_[i]);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseKernel(kernel_) failed");
}
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clReleaseProgram(program_[i]);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseProgram(program_) failed");
}
for (i = 0; i < cur_devices; i++) {
error_ = _wrapper->clReleaseCommandQueue(cmd_queue_[i]);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
if (context_) {
error_ = _wrapper->clReleaseContext(context_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
}
return _crcword;
}
@@ -0,0 +1,60 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_Perf_DeviceConcurrency_H_
#define _OCL_Perf_DeviceConcurrency_H_
#include "OCLTestImp.h"
// Performance test derived from OCLTestImp that runs a Mandelbrot kernel on
// several devices at once; sub-test N uses N + 1 devices and run() reports
// the wall-clock time of the concurrent launch.
class OCLPerfDeviceConcurrency : public OCLTestImp {
public:
OCLPerfDeviceConcurrency();
virtual ~OCLPerfDeviceConcurrency();
public:
// Creates the context and the per-device queue, buffer, program and kernel.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Warm-up launch followed by the timed launch and result validation.
virtual void run(void);
// Releases the objects created in open() and returns _crcword.
virtual unsigned int close(void);
// NOTE(review): not referenced by the member functions of this test — looks
// unused; confirm.
std::string shader_;
// Fills the first width_ elements of `buffer` with `data` through queue idx.
void setData(cl_mem buffer, unsigned int idx, unsigned int data);
// Sums the first width_ elements of `buffer` into totalIters.
void checkData(cl_mem buffer, unsigned int idx);
// Upper bound on the number of devices this test drives; sizes the arrays
// below.
#define MAX_DEVICES 16
// NOTE(review): context_ and error_ may also be declared by OCLTestImp, in
// which case these members hide the base-class ones — confirm.
cl_context context_;
// Per-device objects; slot i belongs to the i-th device.
cl_command_queue cmd_queue_[MAX_DEVICES];
cl_program program_[MAX_DEVICES];
cl_kernel kernel_[MAX_DEVICES];
cl_mem outBuffer_[MAX_DEVICES];
cl_int error_;
// Devices found on the platform, clamped to MAX_DEVICES.
cl_uint num_devices;
// Devices used by the current sub-test (_openTest + 1).
cl_uint cur_devices;
// Number of pixels (and cl_uint results) per output buffer.
unsigned int width_;
// Output buffer size in bytes (width_ * sizeof(cl_uint)).
unsigned int bufSize_;
// Iteration limit passed to the kernel as argument 6.
unsigned int maxIter;
// Index into coords[] selected for the current sub-test.
unsigned int coordIdx;
// Sum of the per-pixel iteration counts from the last checkData() call.
unsigned long long totalIters;
};
#endif // _OCL_Perf_DeviceConcurrency_H_
@@ -0,0 +1,227 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDeviceEnqueue.h"
#include <Timer.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// Turns its arguments into a string literal so the kernel source below can be
// written as plain code instead of quoted lines.
#define KERNEL_CODE(...) #__VA_ARGS__
// One entry per host-side launch size (number of parent work-items).
typedef struct {
unsigned int threads;
} testStruct;
// Launch sizes; open() selects testList[testID_ % testListSize].
static testStruct testList[] = {
{64}, {128}, {256}, {512}, {1024}, {2048}, {4096},
};
// OpenCL C 2.0 source. In parentKernel every work-item enqueues a
// 64-work-item childKernel launch (ndrange_1D(64, 64)) on the default device
// queue. childKernel only stores when idx < 0.
// NOTE(review): idx < 0 is presumably never true, keeping the child free of
// memory traffic — confirm.
const static char* strKernel = {KERNEL_CODE(
\n __kernel void childKernel(__global uint* buf) {
int idx = get_global_id(0);
if (idx < 0) {
buf[idx] = 0;
}
}
\n __kernel void parentKernel(__global uint* buf) {
queue_t def_q = get_default_queue();
ndrange_t ndrange = ndrange_1D(64, 64);
int gid = get_global_id(0);
int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
childKernel(buf);
});
}
\n)};
/**
 * Starts with no device queue, no child kernel and the failure flag cleared,
 * and registers seven sub-tests per testList entry.
 */
OCLPerfDeviceEnqueue::OCLPerfDeviceEnqueue() {
  // Handles that close() releases only when they are non-null.
  deviceQueue_ = NULL;
  kernel2_ = NULL;
  failed_ = false;

  // Seven sub-test groups for each launch size in testList.
  testListSize = sizeof(testList) / sizeof(testList[0]);
  _numSubTests = 7 * testListSize;
}
// Destructor: intentionally empty; deviceQueue_ and kernel2_ are released in
// close().
OCLPerfDeviceEnqueue::~OCLPerfDeviceEnqueue() {}
/**
 * Prepares the device-side enqueue test.
 *
 * Does nothing for CPU devices. Otherwise runs the base-class open, selects
 * the launch size from testList, and marks the test as failed_ (so run() is
 * a no-op) when the device reports an OpenCL major version below 2. It then
 * builds the program with -cl-std=CL2.0, creates the parent and child
 * kernels and a 2048-byte buffer, derives the device queue size from the
 * sub-test index and, when compiled against OpenCL 2.0 headers, creates the
 * default on-device queue.
 *
 * @param test        sub-test index; test % testListSize selects the launch
 *                    size, test / testListSize scales queue size and threads.
 * @param units       forwarded to OCLTestImp::open().
 * @param conversion  forwarded to OCLTestImp::open().
 * @param deviceId    device index, forwarded to OCLTestImp::open() and used to
 *                    select the build target and device-queue device.
 */
void OCLPerfDeviceEnqueue::open(unsigned int test, char* units,
                                double& conversion, unsigned int deviceId) {
  if (type_ == CL_DEVICE_TYPE_CPU) {
    return;
  }

  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");

  testID_ = test;
  threads = testList[testID_ % testListSize].threads;

  size_t param_size = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
                                     0, &param_size);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");

  char* strVersion = new char[param_size];
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
                                     param_size, strVersion, 0);
  // The version string has the form "OpenCL <major>.<minor> ...", so index 7
  // holds the major version digit. Evaluate it and free the buffer before
  // any early return so the allocation cannot leak; a string too short to
  // contain the digit is treated as "below 2.0".
  const bool versionKnown = (error_ == CL_SUCCESS);
  const bool preCL20 =
      versionKnown && ((param_size <= 7) || (strVersion[7] < '2'));
  delete[] strVersion;
  CHECK_RESULT(!versionKnown, "clGetDeviceInfo failed");
  if (preCL20) {
    failed_ = true;
    return;
  }

  cl_uint maxDevQSize = 0;
#if defined(CL_VERSION_2_0)
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId],
                                     CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE,
                                     sizeof(cl_uint), &maxDevQSize, 0);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
#endif

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
                                    "-cl-std=CL2.0", NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Ask for the log size first: with a fixed-size buffer the query fails
    // when the log does not fit and the buffer would be printed
    // uninitialized.
    size_t logSize = 0;
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog = new char[logSize + 1];
    programLog[0] = '\0';
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, logSize, programLog,
                                    0);
    programLog[logSize] = '\0';
    printf("\n%s\n", programLog);
    fflush(stdout);
    delete[] programLog;
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  cl_mem buffer;
  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
                                    &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
  buffers_.push_back(buffer);

  // Hardcoded for us
  if (testID_ >= testListSize) {
    // Later sub-test groups double the queue size (capped at the device
    // maximum) and scale the launch size, keeping threads <= queueSize / 128.
    queueSize = (1 << (testID_ / testListSize)) * 256 * 1024;
    queueSize = std::min(queueSize, maxDevQSize);
    threads *= (1 << (testID_ / testListSize - 1));
    threads = std::min(threads, queueSize / 128);
  } else {
    queueSize = std::max((cl_uint)threads * 128, (cl_uint)16384);
  }

#if defined(CL_VERSION_2_0)
  const cl_queue_properties cprops[] = {
      CL_QUEUE_PROPERTIES,
      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
                                       CL_QUEUE_ON_DEVICE_DEFAULT |
                                       CL_QUEUE_ON_DEVICE),
      CL_QUEUE_SIZE, queueSize, 0};
  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
      context_, devices_[deviceId], cprops, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateCommandQueueWithProperties() failed");
#endif
}
// Error-notification callback with an empty body: all arguments are ignored.
// NOTE(review): nothing in this translation unit references it — it looks
// unused here; confirm before removing.
static void CL_CALLBACK notify_callback(const char* errinfo,
const void* private_info, size_t cb,
void* user_data) {}
void OCLPerfDeviceEnqueue::run(void) {
CPerfCounter timer;
if (type_ == CL_DEVICE_TYPE_CPU) {
return;
}
if (failed_) return;
cl_mem buffer = buffers()[0];
size_t gws[1] = {threads};
size_t lws[1] = {64};
if (gws[0] >= 256) {
lws[0] = 256;
}
error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
// Try to normalize the amount of work per test
unsigned int repeats = (64 / threads) * 50;
if (repeats == 0) repeats = 1;
timer.Reset();
timer.Start();
for (unsigned int i = 0; i < repeats; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
}
timer.Stop();
double sec = timer.GetElapsedTime();
_perfInfo = (float)(threads * repeats) / (float)(sec * 1000000.);
char buf[256];
SNPRINTF(buf, sizeof(buf),
"%7d threads spawning 64 threads, queue size %5dKB (Mdisp/s)",
threads, queueSize / 1024);
testDescString = buf;
}
/**
 * Releases the device queue and the child kernel (when they exist) and then
 * delegates to OCLTestImp::close(). For CPU devices nothing is released and 0
 * is returned, mirroring open(), which returns before doing any work there.
 *
 * @return 0 for CPU devices, otherwise the value of OCLTestImp::close().
 */
unsigned int OCLPerfDeviceEnqueue::close(void) {
  // FIXME: Re-enable CPU test once bug 10143 is fixed.
  if (type_ != CL_DEVICE_TYPE_CPU) {
    if (deviceQueue_ != NULL) {
      _wrapper->clReleaseCommandQueue(deviceQueue_);
    }
    if (kernel2_ != NULL) {
      _wrapper->clReleaseKernel(kernel2_);
    }
    return OCLTestImp::close();
  }
  return 0;
}
@@ -0,0 +1,47 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLPERF_DEVICE_ENQUEUE_H_
#define _OCLPERF_DEVICE_ENQUEUE_H_
#include "OCLTestImp.h"
// Performance test built on OCLTestImp that times kernel launches while a
// device-side command queue exists (see OCLPerfDeviceEnqueue.cpp).
class OCLPerfDeviceEnqueue : public OCLTestImp {
 public:
  OCLPerfDeviceEnqueue();
  virtual ~OCLPerfDeviceEnqueue();

  // Test life cycle, implemented in OCLPerfDeviceEnqueue.cpp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  virtual unsigned int close(void);

 private:
  // On-device default queue created in open(); released in close().
  cl_command_queue deviceQueue_;
  // When true, run() returns without doing any work.
  bool failed_;
  // NOTE(review): presumably the sub-test index given to open() — confirm.
  unsigned int testID_;
  // Second kernel handle; released in close().
  cl_kernel kernel2_;
  // NOTE(review): presumably the number of entries in the test table — confirm.
  unsigned int testListSize;
  // Global work size used for the launches in run().
  unsigned int threads;
  // Size in bytes requested for the device queue (CL_QUEUE_SIZE).
  cl_uint queueSize;
};
#endif // _OCLPERF_DEVICE_ENQUEUE_H_
@@ -0,0 +1,260 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDeviceEnqueue2.h"
#include <Timer.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
// Quiet pesky warnings
// SNPRINTF selects the bounded formatting function: sprintf_s when WIN_OS is
// defined, snprintf otherwise.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// KERNEL_CODE turns its whole argument list into a string literal, so the
// OpenCL kernel source can be written inline as tokens.
#define KERNEL_CODE(...) #__VA_ARGS__
// One row of the thread-count table.
typedef struct {
  unsigned int threads;  // global work size of the host-side launch
} testStruct;

// Global work sizes exercised by the test.
static testStruct testList[] = {
    {64},
    {128},
    {256},
    {512},
    {1024},
    {2048},
    {4096},
};

// Device queue sizes in KB (open() multiplies by 1024).
static unsigned int qsizeList[] = {
    16,
    32,
    64,
    128,
    256,
    512,
};

// Values passed to the kernels as the "level" argument.
static unsigned int levelList[] = {1, 2, 4, 8};
// OpenCL C program source, produced by stringifying the tokens below with
// KERNEL_CODE. It is compiled in open() with -cl-std=CL2.0.
//
// parentKernel: when `level` is non-zero, every work-item enqueues a
//   64-work-item childKernel on the default device queue, passing level - 1.
// childKernel: while `level` is non-zero, the work-item with local id 0
//   enqueues another 64-work-item childKernel with level - 1. At level 0 the
//   only statement is a store guarded by `idx < 0`.
//   NOTE(review): that guard looks like it is never true for a valid global
//   id and is presumably there only to keep `buf` referenced — confirm.
//
// The text inside KERNEL_CODE is runtime data; do not reformat its tokens.
const static char* strKernel = {KERNEL_CODE(
\n __kernel void childKernel(__global uint* buf, uint level) {
if (level) {
queue_t def_q = get_default_queue();
ndrange_t ndrange = ndrange_1D(64, 64);
int gid = get_global_id(0);
int lid = get_local_id(0);
if (lid == 0) {
int enq_res =
enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
childKernel(buf, level - 1);
});
}
} else {
int idx = get_global_id(0);
if (idx < 0) {
buf[idx] = 0;
}
}
}
\n __kernel void parentKernel(__global uint* buf, uint level) {
queue_t def_q = get_default_queue();
ndrange_t ndrange = ndrange_1D(64, 64);
int gid = get_global_id(0);
if (level) {
int enq_res =
enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, ^{
childKernel(buf, level - 1);
});
}
}
\n)};
// Sets up the sub-test space as the cross product of the thread-count,
// queue-size and level tables, and puts the owned handles and flags into
// their "nothing created yet" state.
OCLPerfDeviceEnqueue2::OCLPerfDeviceEnqueue2() {
  subTests_thread = sizeof(testList) / sizeof(testList[0]);
  subTests_qsize = sizeof(qsizeList) / sizeof(qsizeList[0]);
  subTests_level = sizeof(levelList) / sizeof(levelList[0]);
  testListSize = subTests_thread;
  _numSubTests = subTests_level * subTests_qsize * subTests_thread;

  deviceQueue_ = NULL;
  kernel2_ = NULL;
  failed_ = false;
  skip_ = false;
  level = 2;
}

// Nothing to do here: OpenCL objects are released in close().
OCLPerfDeviceEnqueue2::~OCLPerfDeviceEnqueue2() {}
// Prepares one sub-test: decodes the sub-test index into thread count, device
// queue size and nesting level, builds the kernels, allocates the buffer and
// creates the on-device default queue.
//
// Parameters:
//   test       - sub-test index; decomposed as
//                (thread index, queue-size index, level index).
//   units      - forwarded to OCLTestImp::open().
//   conversion - forwarded to OCLTestImp::open().
//   deviceId   - index of the device to run on.
//
// Outcome flags:
//   failed_ - set when the device reports an OpenCL version below 2.0.
//   skip_   - set when the host headers lack CL_VERSION_2_0.
// Does nothing on CPU devices.
void OCLPerfDeviceEnqueue2::open(unsigned int test, char* units,
                                 double& conversion, unsigned int deviceId) {
  if (type_ == CL_DEVICE_TYPE_CPU) {
    return;
  }

  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");

  testID_ = test;
  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
  level = levelList[testID_ % subTests_level];

  // Query the device version string ("OpenCL <major>.<minor> ...").
  size_t param_size = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
                                     0, &param_size);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");

  char* strVersion = new char[param_size];
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
                                     param_size, strVersion, 0);
  // Evaluate the major-version digit (index 7) and free the buffer before any
  // early exit, so it cannot leak. A string too short to hold that digit is
  // treated as pre-2.0 rather than being read out of bounds.
  const bool preCl20 = (error_ == CL_SUCCESS) &&
                       (param_size <= 7 || strVersion[7] < '2');
  delete[] strVersion;  // array form: the buffer came from new[]
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  if (preCl20) {
    failed_ = true;
    return;
  }

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
                                    "-cl-std=CL2.0", NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Zero-initialized so that printing stays well defined even when the log
    // query itself fails (for example because the log exceeds the buffer).
    char programLog[1024] = {0};
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
    printf("\n%s\n", programLog);
    fflush(stdout);
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  cl_mem buffer;
  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
                                    &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
  buffers_.push_back(buffer);

#if defined(CL_VERSION_2_0)
  // Out-of-order, on-device default queue of the requested size.
  const cl_queue_properties cprops[] = {
      CL_QUEUE_PROPERTIES,
      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
                                       CL_QUEUE_ON_DEVICE_DEFAULT |
                                       CL_QUEUE_ON_DEVICE),
      CL_QUEUE_SIZE, queueSize, 0};
  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
      context_, devices_[deviceId], cprops, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateCommandQueueWithProperties() failed");
#else
  skip_ = true;
  testDescString =
      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
  return;
#endif
}
// No-op notification callback; its parameter list matches the pfn_notify
// callback accepted by clCreateContext. All four arguments are ignored.
// NOTE(review): nothing in OCLPerfDeviceEnqueue2.cpp refers to this function —
// confirm whether it can be removed.
static void CL_CALLBACK notify_callback(const char* errinfo,
const void* private_info, size_t cb,
void* user_data) {}
void OCLPerfDeviceEnqueue2::run(void) {
CPerfCounter timer;
if (type_ == CL_DEVICE_TYPE_CPU) {
return;
}
if (failed_) {
return;
}
if (skip_) {
return;
}
cl_mem buffer = buffers()[0];
size_t gws[1] = {threads};
size_t lws[1] = {64};
error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
// Try to normalize the amount of work per test
// unsigned int repeats = (4096 / threads) * 100 ;
unsigned int repeats = (4096 / threads) * 10;
// unsigned int repeats = 100;
timer.Reset();
timer.Start();
for (unsigned int i = 0; i < repeats; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
}
timer.Stop();
double sec = timer.GetElapsedTime();
_perfInfo = (float)(threads * repeats * level) / (float)(sec * 1000000.);
char buf[256];
SNPRINTF(
buf, sizeof(buf),
"%5d threads spawning 64 threads, queue size %3dKB (Mdisp/s), level=%2d",
threads, queueSize / 1024, level);
testDescString = buf;
}
unsigned int OCLPerfDeviceEnqueue2::close(void) {
// FIXME: Re-enable CPU test once bug 10143 is fixed.
if (type_ == CL_DEVICE_TYPE_CPU) {
return 0;
}
if (deviceQueue_) {
error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
if (kernel2_) {
error_ = _wrapper->clReleaseKernel(kernel2_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
}
return OCLTestImp::close();
}
@@ -0,0 +1,54 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLPERF_DEVICE_ENQUEUE2_H_
#define _OCLPERF_DEVICE_ENQUEUE2_H_
#include "OCLTestImp.h"
// Performance test built on OCLTestImp that measures nested device-side
// enqueues (see OCLPerfDeviceEnqueue2.cpp). Sub-tests are the combinations of
// a thread count, a device queue size and a nesting level.
class OCLPerfDeviceEnqueue2 : public OCLTestImp {
 public:
  OCLPerfDeviceEnqueue2();
  virtual ~OCLPerfDeviceEnqueue2();

  // Test life cycle, implemented in OCLPerfDeviceEnqueue2.cpp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  virtual unsigned int close(void);

 private:
  // On-device default queue created in open(); released in close().
  cl_command_queue deviceQueue_;
  // Sub-test index received by open().
  unsigned int testID_;
  // Handle of "childKernel"; released in close().
  cl_kernel kernel2_;
  // Copy of subTests_thread, set in the constructor.
  unsigned int testListSize;
  // Global work size of the host-side launch.
  unsigned int threads;
  // Device queue size in bytes (CL_QUEUE_SIZE).
  cl_uint queueSize;
  // Number of entries in the level, queue-size and thread tables.
  unsigned int subTests_level;
  unsigned int subTests_qsize;
  unsigned int subTests_thread;
  // Value passed to the kernel as its "level" argument.
  unsigned int level;
  // NOTE(review): not referenced in OCLPerfDeviceEnqueue2.cpp — confirm it is
  // needed.
  unsigned int lws_value;
  // Set by open() when the device reports an OpenCL version below 2.0.
  bool failed_;
  // Set by open() when the build lacks CL_VERSION_2_0.
  bool skip_;
};
#endif // _OCLPERF_DEVICE_ENQUEUE2_H_
@@ -0,0 +1,267 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDeviceEnqueueEvent.h"
#include <Timer.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
// Quiet pesky warnings
// SNPRINTF selects the bounded formatting function: sprintf_s when WIN_OS is
// defined, snprintf otherwise.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// KERNEL_CODE turns its whole argument list into a string literal, so the
// OpenCL kernel source can be written inline as tokens.
#define KERNEL_CODE(...) #__VA_ARGS__
// One row of the thread-count table.
typedef struct {
  unsigned int threads;  // global work size of the host-side launch
} testStruct;

// Global work sizes exercised by the test.
static testStruct testList[] = {
    {64},
    {128},
    {256},
    {512},
    {1024},
    {2048},
    {4096},
};

// Device queue sizes in KB (open() multiplies by 1024).
static unsigned int qsizeList[] = {
    16,
    32,
    64,
    128,
    256,
    512,
};

// Values passed to the kernels as the "level" argument.
static unsigned int levelList[] = {1, 2, 4, 8};
// OpenCL C program source, produced by stringifying the tokens below with
// KERNEL_CODE. It is compiled in open() with -cl-std=CL2.0.
//
// parentKernel: when `level` is non-zero, each work-item creates a user
//   event, enqueues `level` 64-work-item childKernel launches on the default
//   device queue with CLK_ENQUEUE_FLAGS_NO_WAIT and no wait list, then marks
//   the user event complete and releases it. When `level` is zero it only
//   executes a store guarded by `idx < 0`.
// childKernel: its body is just a store guarded by `idx < 0`.
//   NOTE(review): that guard looks like it is never true for a valid global
//   id and is presumably there only to keep `buf` referenced — confirm.
//   NOTE(review): the events returned in block_evt are not released in this
//   source — confirm that is intentional.
//
// The text inside KERNEL_CODE is runtime data; do not reformat its tokens.
const static char* strKernel = {KERNEL_CODE(
\n __kernel void childKernel(__global uint* buf, uint level,
clk_event_t wait_evt) {
int idx = get_global_id(0);
if (idx < 0) {
buf[idx] = 0;
}
}
\n __kernel void parentKernel(__global uint* buf, uint level) {
if (level) {
queue_t def_q = get_default_queue();
ndrange_t ndrange = ndrange_1D(64, 64);
clk_event_t user_evt = create_user_event();
clk_event_t block_evt, wait_evt;
wait_evt = user_evt;
for (uint i = 0; i < level; i++) {
int enq_res = enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, 0,
/*&user_evt*/ NULL, &block_evt, ^{
childKernel(buf, level - 1, block_evt);
});
// wait_evt = block_evt;
}
if (is_valid_event(user_evt)) {
set_user_event_status(user_evt, CL_COMPLETE);
release_event(user_evt);
}
} else {
int idx = get_global_id(0);
if (idx < 0) {
buf[idx] = 0;
}
}
}
\n)};
// Sets up the sub-test space as the cross product of the thread-count,
// queue-size and level tables, and puts the owned handles and flags into
// their "nothing created yet" state.
OCLPerfDeviceEnqueueEvent::OCLPerfDeviceEnqueueEvent() {
  subTests_thread = sizeof(testList) / sizeof(testList[0]);
  subTests_qsize = sizeof(qsizeList) / sizeof(qsizeList[0]);
  subTests_level = sizeof(levelList) / sizeof(levelList[0]);
  testListSize = subTests_thread;
  // Earlier formula, kept for reference:
  //_numSubTests = 2*testListSize + subTests_level + subTests_qsize;
  _numSubTests = subTests_level * subTests_qsize * subTests_thread;

  deviceQueue_ = NULL;
  kernel2_ = NULL;
  failed_ = false;
  skip_ = false;
  level = 2;
}

// Nothing to do here: OpenCL objects are released in close().
OCLPerfDeviceEnqueueEvent::~OCLPerfDeviceEnqueueEvent() {}
// Prepares one sub-test: decodes the sub-test index into thread count, device
// queue size and level, builds the kernels, allocates the buffer and creates
// the on-device default queue.
//
// Parameters:
//   test       - sub-test index; decomposed as
//                (thread index, queue-size index, level index).
//   units      - forwarded to OCLTestImp::open().
//   conversion - forwarded to OCLTestImp::open().
//   deviceId   - index of the device to run on.
//
// Outcome flags:
//   failed_ - set when the device reports an OpenCL version below 2.0.
//   skip_   - set when the host headers lack CL_VERSION_2_0.
// Does nothing on CPU devices.
void OCLPerfDeviceEnqueueEvent::open(unsigned int test, char* units,
                                     double& conversion,
                                     unsigned int deviceId) {
  if (type_ == CL_DEVICE_TYPE_CPU) {
    return;
  }

  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");

  testID_ = test;
  threads = testList[testID_ / (subTests_qsize * subTests_level)].threads;
  queueSize = qsizeList[(testID_ / subTests_level) % subTests_qsize] * 1024;
  level = levelList[testID_ % subTests_level];
  lws_value = 64;

  // Query the device version string ("OpenCL <major>.<minor> ...").
  size_t param_size = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
                                     0, &param_size);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");

  char* strVersion = new char[param_size];
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
                                     param_size, strVersion, 0);
  // Evaluate the major-version digit (index 7) and free the buffer before any
  // early exit, so it cannot leak. A string too short to hold that digit is
  // treated as pre-2.0 rather than being read out of bounds.
  const bool preCl20 = (error_ == CL_SUCCESS) &&
                       (param_size <= 7 || strVersion[7] < '2');
  delete[] strVersion;  // array form: the buffer came from new[]
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  if (preCl20) {
    failed_ = true;
    return;
  }

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
                                    "-cl-std=CL2.0", NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Zero-initialized so that printing stays well defined even when the log
    // query itself fails (for example because the log exceeds the buffer).
    char programLog[1024] = {0};
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
    printf("\n%s\n", programLog);
    fflush(stdout);
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");
  kernel2_ = _wrapper->clCreateKernel(program_, "childKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  cl_mem buffer;
  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
                                    &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
  buffers_.push_back(buffer);

#if defined(CL_VERSION_2_0)
  // Out-of-order, on-device default queue of the requested size.
  const cl_queue_properties cprops[] = {
      CL_QUEUE_PROPERTIES,
      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
                                       CL_QUEUE_ON_DEVICE_DEFAULT |
                                       CL_QUEUE_ON_DEVICE),
      CL_QUEUE_SIZE, queueSize, 0};
  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
      context_, devices_[deviceId], cprops, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateCommandQueueWithProperties() failed");
#else
  skip_ = true;
  testDescString =
      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
  return;
#endif
}
// No-op notification callback; its parameter list matches the pfn_notify
// callback accepted by clCreateContext. All four arguments are ignored.
// NOTE(review): nothing in OCLPerfDeviceEnqueueEvent.cpp refers to this
// function — confirm whether it can be removed.
static void CL_CALLBACK notify_callback(const char* errinfo,
const void* private_info, size_t cb,
void* user_data) {}
void OCLPerfDeviceEnqueueEvent::run(void) {
CPerfCounter timer;
if (type_ == CL_DEVICE_TYPE_CPU) {
return;
}
if (failed_) {
return;
}
if (skip_) {
return;
}
cl_mem buffer = buffers()[0];
size_t gws[1] = {threads};
size_t lws[1] = {lws_value};
error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(unsigned int), &level);
CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
// Try to normalize the amount of work per test
// unsigned int repeats = (4096 / threads) * 100 ;
unsigned int repeats = (4096 / threads) * 10;
// unsigned int repeats = 100;
timer.Reset();
timer.Start();
for (unsigned int i = 0; i < repeats; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
NULL, gws, lws, 0, NULL, NULL);
CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
_wrapper->clFinish(cmdQueues_[_deviceId]);
}
timer.Stop();
double sec = timer.GetElapsedTime();
_perfInfo = (float)(threads * repeats * level) / (float)(sec * 1000000.);
char buf[256];
SNPRINTF(
buf, sizeof(buf),
"%5d threads spawning %2d threads, queue size %3dKB (Mdisp/s), level=%2d",
threads, lws_value, queueSize / 1024, level);
testDescString = buf;
}
unsigned int OCLPerfDeviceEnqueueEvent::close(void) {
// FIXME: Re-enable CPU test once bug 10143 is fixed.
if (type_ == CL_DEVICE_TYPE_CPU) {
return 0;
}
if (deviceQueue_) {
error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
if (kernel2_) {
error_ = _wrapper->clReleaseKernel(kernel2_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
}
return OCLTestImp::close();
}
@@ -0,0 +1,54 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
#define _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
#include "OCLTestImp.h"
// Performance test built on OCLTestImp that measures device-side enqueues
// which return events (see OCLPerfDeviceEnqueueEvent.cpp). Sub-tests are the
// combinations of a thread count, a device queue size and a level.
class OCLPerfDeviceEnqueueEvent : public OCLTestImp {
 public:
  OCLPerfDeviceEnqueueEvent();
  virtual ~OCLPerfDeviceEnqueueEvent();

  // Test life cycle, implemented in OCLPerfDeviceEnqueueEvent.cpp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  virtual unsigned int close(void);

 private:
  // On-device default queue created in open(); released in close().
  cl_command_queue deviceQueue_;
  // Sub-test index received by open().
  unsigned int testID_;
  // Handle of "childKernel"; released in close().
  cl_kernel kernel2_;
  // Copy of subTests_thread, set in the constructor.
  unsigned int testListSize;
  // Global work size of the host-side launch.
  unsigned int threads;
  // Device queue size in bytes (CL_QUEUE_SIZE).
  cl_uint queueSize;
  // Number of entries in the level, queue-size and thread tables.
  unsigned int subTests_level;
  unsigned int subTests_qsize;
  unsigned int subTests_thread;
  // Value passed to the kernel as its "level" argument.
  unsigned int level;
  // Local work size of the host-side launch; open() sets it to 64.
  unsigned int lws_value;
  // Set by open() when the device reports an OpenCL version below 2.0.
  bool failed_;
  // Set by open() when the build lacks CL_VERSION_2_0.
  bool skip_;
};
#endif // _OCLPERF_DEVICE_ENQUEUE_EVENT_H_
@@ -0,0 +1,233 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDeviceEnqueueSier.h"
#include <Timer.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
// Quiet pesky warnings
// SNPRINTF selects the bounded formatting function: sprintf_s when WIN_OS is
// defined, snprintf otherwise.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// KERNEL_CODE turns its whole argument list into a string literal, so the
// OpenCL kernel source can be written inline as tokens.
#define KERNEL_CODE(...) #__VA_ARGS__
// NOTE(review): testStruct is not used anywhere else in
// OCLPerfDeviceEnqueueSier.cpp — confirm whether it can be dropped.
typedef struct {
  unsigned int threads;
} testStruct;

// Edge lengths of the square launch grid, one per sub-test: 3^4 .. 3^10.
static unsigned int sizeList[] = {
    81,     // 3^4
    243,    // 3^5
    729,    // 3^6
    2187,   // 3^7
    6561,   // 3^8
    19683,  // 3^9
    59049,  // 3^10
};
// OpenCL C program source, produced by stringifying the tokens below with
// KERNEL_CODE. It is compiled in open() with -cl-std=CL2.0.
//
// parentKernel splits its 2D grid into thirds along each axis. Work-items in
// the central third only execute a store guarded by `idx < 0`. Outside the
// central third, the work-item whose x and y are both multiples of one_third
// (and only while one_third > 1) enqueues parentKernel again on the default
// device queue over a one_third x one_third grid.
//   NOTE(review): the `idx < 0` guard looks like it is never true for a valid
//   global id and is presumably there only to keep `buf` referenced — confirm.
//
// The text inside KERNEL_CODE is runtime data; do not reformat its tokens.
const static char* strKernel = {KERNEL_CODE(
\n __kernel void parentKernel(__global uint* buf, int width, int offsetx,
int offsety) {
int x = get_global_id(0);
int y = get_global_id(1);
queue_t q = get_default_queue();
int one_third = get_global_size(0) / 3;
int two_thirds = 2 * one_third;
if (x >= one_third && x < two_thirds && y >= one_third && y < two_thirds) {
int idx = get_global_id(0);
if (idx < 0) {
buf[idx] = 0;
}
} else {
if (one_third > 1 && x % one_third == 0 && y % one_third == 0) {
const size_t grid[2] = {one_third, one_third};
enqueue_kernel(q, 0, ndrange_2D(grid), ^{
parentKernel(buf, width, x + offsetx, y + offsety);
});
}
}
}
\n)};
// One sub-test per entry of sizeList; handles and flags start in their
// "nothing created yet" state.
OCLPerfDeviceEnqueueSier::OCLPerfDeviceEnqueueSier() {
  _numSubTests = sizeof(sizeList) / sizeof(sizeList[0]);

  deviceQueue_ = NULL;
  skip_ = false;
  failed_ = false;
}

// Nothing to do here: OpenCL objects are released in close().
OCLPerfDeviceEnqueueSier::~OCLPerfDeviceEnqueueSier() {}
// Prepares one sub-test: builds parentKernel, allocates the buffer, selects
// the grid edge length from sizeList and creates a 512 KB on-device default
// queue.
//
// Parameters:
//   test       - sub-test index; used directly as the index into sizeList.
//   units      - forwarded to OCLTestImp::open().
//   conversion - forwarded to OCLTestImp::open().
//   deviceId   - index of the device to run on.
//
// Outcome flags:
//   failed_ - set when the device reports an OpenCL version below 2.0.
//   skip_   - set when the host headers lack CL_VERSION_2_0.
// Does nothing on CPU devices.
void OCLPerfDeviceEnqueueSier::open(unsigned int test, char* units,
                                    double& conversion, unsigned int deviceId) {
  if (type_ == CL_DEVICE_TYPE_CPU) {
    return;
  }

  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");

  testID_ = test;

  // Query the device version string ("OpenCL <major>.<minor> ...").
  size_t param_size = 0;
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION, 0,
                                     0, &param_size);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");

  char* strVersion = new char[param_size];
  error_ = _wrapper->clGetDeviceInfo(devices_[_deviceId], CL_DEVICE_VERSION,
                                     param_size, strVersion, 0);
  // Evaluate the major-version digit (index 7) and free the buffer before any
  // early exit, so it cannot leak. A string too short to hold that digit is
  // treated as pre-2.0 rather than being read out of bounds.
  const bool preCl20 = (error_ == CL_SUCCESS) &&
                       (param_size <= 7 || strVersion[7] < '2');
  delete[] strVersion;  // array form: the buffer came from new[]
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed");
  if (preCl20) {
    failed_ = true;
    return;
  }

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
                                    "-cl-std=CL2.0", NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Zero-initialized so that printing stays well defined even when the log
    // query itself fails (for example because the log exceeds the buffer).
    char programLog[1024] = {0};
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
    printf("\n%s\n", programLog);
    fflush(stdout);
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "parentKernel", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  cl_mem buffer;
  buffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR, 2048, NULL,
                                    &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
  buffers_.push_back(buffer);

  queueSize = 512 * 1024;
  image_size = sizeList[testID_];

#if defined(CL_VERSION_2_0)
  // Out-of-order, on-device default queue of the requested size.
  const cl_queue_properties cprops[] = {
      CL_QUEUE_PROPERTIES,
      static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
                                       CL_QUEUE_ON_DEVICE_DEFAULT |
                                       CL_QUEUE_ON_DEVICE),
      CL_QUEUE_SIZE, queueSize, 0};
  deviceQueue_ = _wrapper->clCreateCommandQueueWithProperties(
      context_, devices_[deviceId], cprops, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS),
               "clCreateCommandQueueWithProperties() failed");
#else
  skip_ = true;
  testDescString =
      "DeviceEnqueue NOT supported for < 2.0 builds. Test Skipped.";
  return;
#endif
}
// No-op notification callback; its parameter list matches the pfn_notify
// callback accepted by clCreateContext. All four arguments are ignored.
// NOTE(review): nothing in OCLPerfDeviceEnqueueSier.cpp refers to this
// function — confirm whether it can be removed.
static void CL_CALLBACK notify_callback(const char* errinfo,
const void* private_info, size_t cb,
void* user_data) {}
// Times 100 launches of parentKernel over an image_size x image_size grid and
// reports the rate in millions of kernel dispatches per second through
// _perfInfo, with a matching description in testDescString.
//
// Returns without doing anything on CPU devices, or when open() set failed_
// or skip_. One untimed single-work-item warm-up launch precedes the timed
// loop; every launch is followed by clFinish.
void OCLPerfDeviceEnqueueSier::run(void) {
  CPerfCounter timer;
  if (type_ == CL_DEVICE_TYPE_CPU) {
    return;
  }
  if (failed_) {
    return;
  }
  if (skip_) {
    return;
  }

  cl_mem buffer = buffers()[0];
  size_t gws[1] = {1};

  error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), &buffer);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");

  int width = image_size, offsetx = 0, offsety = 0;
  error_ |= _wrapper->clSetKernelArg(kernel_, 1, sizeof(int), (void*)&width);
  error_ |= _wrapper->clSetKernelArg(kernel_, 2, sizeof(int), (void*)&offsetx);
  error_ |= _wrapper->clSetKernelArg(kernel_, 3, sizeof(int), (void*)&offsety);
  CHECK_RESULT((error_ != CL_SUCCESS), "clSetKernelArg() failed");

  // Warm-up launch (one work-item, runtime-chosen local size), not timed.
  error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 1,
                                            NULL, gws, 0, 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
  _wrapper->clFinish(cmdQueues_[_deviceId]);

  size_t global_work_size[2] = {image_size, image_size};

  // Try to normalize the amount of work per test
  unsigned int repeats = 100;
  timer.Reset();
  timer.Start();
  for (unsigned int i = 0; i < repeats; i++) {
    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_, 2,
                                              NULL, global_work_size, 0, 0,
                                              NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel() failed");
    _wrapper->clFinish(cmdQueues_[_deviceId]);
  }
  timer.Stop();
  double sec = timer.GetElapsedTime();

  // Kernel count is 8^(k-1) for image_size == 3^k. It is computed with
  // integer arithmetic because pow(8.0, log(n)/log(3) - 1) can land just
  // below the exact power and lose one when truncated to an integer.
  unsigned int numOfKernels = 1;
  for (unsigned int extent = image_size; extent > 3; extent /= 3) {
    numOfKernels *= 8;
  }

  // Multiply in double: numOfKernels * repeats exceeds the 32-bit unsigned
  // range for the largest grid (8^9 * 100).
  _perfInfo = static_cast<float>(static_cast<double>(numOfKernels) * repeats /
                                 (sec * 1000000.));

  char buf[256];
  SNPRINTF(buf, sizeof(buf), "image_size = %5d, queue size %3dKB (Mdisp/s)",
           image_size, queueSize / 1024);
  testDescString = buf;
}
unsigned int OCLPerfDeviceEnqueueSier::close(void) {
// FIXME: Re-enable CPU test once bug 10143 is fixed.
if (type_ == CL_DEVICE_TYPE_CPU) {
return 0;
}
if (deviceQueue_) {
error_ = _wrapper->clReleaseCommandQueue(deviceQueue_);
CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
"clReleaseCommandQueue failed");
}
return OCLTestImp::close();
}
@@ -0,0 +1,49 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCLPERF_DEVICE_ENQUEUE_SIER_H_
#define _OCLPERF_DEVICE_ENQUEUE_SIER_H_
#include "OCLTestImp.h"
// Performance test built on OCLTestImp in which a 2D kernel recursively
// re-enqueues itself on the device over sub-grids one third the size
// (see OCLPerfDeviceEnqueueSier.cpp). One sub-test per grid edge length.
class OCLPerfDeviceEnqueueSier : public OCLTestImp {
 public:
  OCLPerfDeviceEnqueueSier();
  virtual ~OCLPerfDeviceEnqueueSier();

  // Test life cycle, implemented in OCLPerfDeviceEnqueueSier.cpp.
  virtual void open(unsigned int test, char* units, double& conversion,
                    unsigned int deviceID);
  virtual void run(void);
  virtual unsigned int close(void);

 private:
  // On-device default queue created in open(); released in close().
  cl_command_queue deviceQueue_;
  // Sub-test index received by open(); indexes the size table.
  unsigned int testID_;
  // NOTE(review): not referenced in OCLPerfDeviceEnqueueSier.cpp — confirm it
  // is needed.
  unsigned int testListSize;
  // unsigned int threads;
  // Device queue size in bytes (CL_QUEUE_SIZE); open() sets it to 512 KB.
  cl_uint queueSize;
  // Edge length of the square launch grid.
  unsigned int image_size;
  // Set by open() when the device reports an OpenCL version below 2.0.
  bool failed_;
  // Set by open() when the build lacks CL_VERSION_2_0.
  bool skip_;
};
#endif // _OCLPERF_DEVICE_ENQUEUE_SIER_H_
@@ -0,0 +1,391 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDispatchSpeed.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "CL/cl.h"
#include "Timer.h"
// Quiet pesky warnings
// SNPRINTF selects the bounded formatting function: sprintf_s when WIN_OS is
// defined, snprintf otherwise.
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif
// NOTE(review): presumably the size of the character buffers used for test
// descriptions; its uses are outside the visible part of the file — confirm.
#define CHAR_BUF_SIZE 512
// One dispatch-speed configuration.
typedef struct {
  unsigned int iterations;  // iteration count for the configuration
  int flushEvery;           // flush interval; -1 appears as a special value
} testStruct;

// All (iterations, flushEvery) combinations exercised by the test.
testStruct testList[] = {
    {1, -1},
    {1, -1},
    {10, 1},
    {10, -1},
    {100, 1},
    {100, 10},
    {100, -1},
    {1000, 1},
    {1000, 10},
    {1000, 100},
    {1000, -1},
    {10000, 1},
    {10000, 10},
    {10000, 100},
    {10000, 1000},
    {10000, -1},
    {100000, 1},
    {100000, 10},
    {100000, 100},
    {100000, 1000},
    {100000, 10000},
    {100000, -1},
};

// NOTE(review): usage is outside the visible part of the file — confirm what
// these counts drive.
unsigned int mapTestList[] = {
    1,
    1,
    10,
    100,
    1000,
    10000,
    100000,
};
// Replaces the contents of shader_ with the source of the _dispatchSpeed
// kernel. The kernel's only statement is a store guarded by `i < 0`.
void OCLPerfDispatchSpeed::genShader(void) {
  static const char kDispatchKernelSource[] =
      "__kernel void _dispatchSpeed(__global float *outBuf)\n"
      "{\n"
      "    int i = (int) get_global_id(0);\n"
      "    if (i < 0)\n"
      "        outBuf[i] = 0.0f;\n"
      "}\n";

  shader_.clear();
  shader_ += kDispatchKernelSource;
}
// Every testList entry is run in four variants, so the sub-test count is
// four times the table length.
OCLPerfDispatchSpeed::OCLPerfDispatchSpeed() {
  testListSize = sizeof(testList) / sizeof(testList[0]);
  _numSubTests = 4 * testListSize;
}

// Nothing to release here.
OCLPerfDispatchSpeed::~OCLPerfDispatchSpeed() {}
// No-op notification callback; its parameter list matches the pfn_notify
// callback accepted by clCreateContext. All four arguments are ignored.
// NOTE(review): its use, if any, is outside the visible part of the file.
static void CL_CALLBACK notify_callback(const char *errinfo,
const void *private_info, size_t cb,
void *user_data) {}
// Prepares one dispatch-speed sub-test.
//
// test       - sub-test index. test % testListSize selects the testList entry,
//              (test / testListSize) % 2 enables the warm-up dispatch and
//              test >= 2 * testListSize selects clFinish ("sleep") waits
//              instead of event polling ("spin").
// units      - not used by this test.
// conversion - set to 1.0.
// deviceId   - index of the device (within the selected platform) to run on.
//
// Creates the context, command queue, output buffer, program and kernel, and
// binds the output buffer as kernel argument 0. On any failure CHECK_RESULT
// reports the error; objects created so far are released later by close().
void OCLPerfDispatchSpeed::open(unsigned int test, char *units,
                                double &conversion, unsigned int deviceId) {
  cl_uint numPlatforms = 0;
  cl_platform_id platform = NULL;
  cl_uint num_devices = 0;
  cl_device_id device = NULL;

  _crcword = 0;
  conversion = 1.0f;
  _deviceId = deviceId;
  _openTest = test % testListSize;

  context_ = 0;
  cmd_queue_ = 0;
  program_ = 0;
  kernel_ = 0;
  outBuffer_ = 0;

  doWarmup = ((test / testListSize) % 2) != 0;
  sleep = (test >= (testListSize * 2));

  bufSize_ = 64 * sizeof(cl_float);

  error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");
  CHECK_RESULT(numPlatforms == 0, "No platforms available!");

  // Copy the requested platform out of the temporary array and free the array
  // before any check that may return early, so it cannot leak.
  cl_platform_id *platforms = new cl_platform_id[numPlatforms];
  error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL);
  if ((error_ == CL_SUCCESS) && (_platformIndex < numPlatforms)) {
    platform = platforms[_platformIndex];
  }
  delete[] platforms;
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed");

  /*
   * If we could find our platform, use it. If not, die as we need the AMD
   * platform for these extensions.
   */
  CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed");

  /* Get the number of requested devices */
  error_ = _wrapper->clGetDeviceIDs(platform, type_, 0, NULL, &num_devices);
  // Runtime returns an error when no GPU devices are present instead of just
  // returning 0 devices, so the status of the count query is not checked;
  // only the resulting count is.
  CHECK_RESULT(num_devices == 0, "no devices");

  cl_device_id *devices =
      (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
  CHECK_RESULT(devices == 0, "no devices");

  /* Get the requested device */
  error_ =
      _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL);
  if ((error_ == CL_SUCCESS) && (_deviceId < num_devices)) {
    device = devices[_deviceId];
  }
  // The device list is only needed to pick one id; release it before the
  // checks below can return.
  free(devices);
  CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed");
  CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available");

  context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL,
                                       &error_);
  CHECK_RESULT(context_ == 0, "clCreateContext failed");

  cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, 0, NULL);
  CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed");

  outBuffer_ = _wrapper->clCreateBuffer(context_, 0, bufSize_, NULL, &error_);
  CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");

  genShader();
  const char *source = shader_.c_str();
  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &source, NULL,
                                                 &error_);
  CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &device, "", NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Zero-initialised so that something printable exists even when the log
    // query itself fails.
    char log[16384] = {0};
    _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG,
                                    sizeof(log), log, NULL);
    printf("Build error -> %s\n", log);
    // error_ still holds the clBuildProgram status here, so this check fires.
    CHECK_RESULT(error_ != CL_SUCCESS, "clBuildProgram failed");
  }

  kernel_ = _wrapper->clCreateKernel(program_, "_dispatchSpeed", &error_);
  CHECK_RESULT(kernel_ == 0, "clCreateKernel failed");

  error_ =
      _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer_);
  CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed");
}
void OCLPerfDispatchSpeed::run(void) {
int global = bufSize_ / sizeof(cl_float);
int local = 64;
size_t global_work_size[1] = {(size_t)global};
size_t local_work_size[1] = {(size_t)local};
CPerfCounter timer;
cl_event event;
cl_int eventStatus;
if (doWarmup) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, &event);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
_wrapper->clFinish(cmd_queue_);
}
timer.Reset();
timer.Start();
for (unsigned int i = 0; i < testList[_openTest].iterations; i++) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, &event);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
if ((testList[_openTest].flushEvery > 0) &&
(((i + 1) % testList[_openTest].flushEvery) == 0)) {
if (sleep) {
_wrapper->clFinish(cmd_queue_);
} else {
_wrapper->clFlush(cmd_queue_);
error_ =
_wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
while (eventStatus > 0) {
error_ =
_wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
}
}
}
if (i != (testList[_openTest].iterations - 1)) {
_wrapper->clReleaseEvent(event);
}
}
if (sleep) {
_wrapper->clFinish(cmd_queue_);
} else {
_wrapper->clFlush(cmd_queue_);
error_ = _wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
while (eventStatus > 0) {
error_ =
_wrapper->clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int), &eventStatus, NULL);
}
}
_wrapper->clReleaseEvent(event);
timer.Stop();
double sec = timer.GetElapsedTime();
// microseconds per launch
double perf = (1000000.f * sec / testList[_openTest].iterations);
const char *waitType;
const char *extraChar;
const char *n;
const char *warmup;
if (sleep) {
waitType = "sleep";
extraChar = "";
n = "";
} else {
waitType = "spin";
n = "n";
extraChar = " ";
}
if (doWarmup) {
warmup = "warmup";
} else {
warmup = "";
}
_perfInfo = (float)perf;
char buf[256];
if (testList[_openTest].flushEvery > 0) {
SNPRINTF(buf, sizeof(buf),
" %7d dispatches %s%sing every %5d %6s (us/disp)",
testList[_openTest].iterations, waitType, n,
testList[_openTest].flushEvery, warmup);
} else {
SNPRINTF(buf, sizeof(buf),
" %7d dispatches (%s%s) %6s (us/disp)",
testList[_openTest].iterations, waitType, extraChar, warmup);
}
testDescString = buf;
}
// Releases every OpenCL object that open() managed to create, in the order
// buffer, kernel, program, queue, context. Release failures are reported via
// CHECK_RESULT_NO_RETURN so that the remaining objects are still released.
// Returns _crcword.
unsigned int OCLPerfDispatchSpeed::close(void) {
  // Output buffer
  if (outBuffer_ != 0) {
    error_ = _wrapper->clReleaseMemObject(outBuffer_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseMemObject(outBuffer_) failed");
  }

  // Kernel and the program it came from
  if (kernel_ != 0) {
    error_ = _wrapper->clReleaseKernel(kernel_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed");
  }
  if (program_ != 0) {
    error_ = _wrapper->clReleaseProgram(program_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed");
  }

  // Queue first, then the context that owns it
  if (cmd_queue_ != 0) {
    error_ = _wrapper->clReleaseCommandQueue(cmd_queue_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS,
                           "clReleaseCommandQueue failed");
  }
  if (context_ != 0) {
    error_ = _wrapper->clReleaseContext(context_);
    CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed");
  }

  return _crcword;
}
// Sizes the sub-test space from mapTestList: each entry is run with and
// without the warm-up dispatch.
OCLPerfMapDispatchSpeed::OCLPerfMapDispatchSpeed() {
  const unsigned int kVariantsPerEntry = 2;
  testListSize = sizeof(mapTestList) / sizeof(mapTestList[0]);
  _numSubTests = kVariantsPerEntry * testListSize;
}
void OCLPerfMapDispatchSpeed::run(void) {
cl_mem outBuffer;
outBuffer = _wrapper->clCreateBuffer(context_, CL_MEM_ALLOC_HOST_PTR,
bufSize_, NULL, &error_);
CHECK_RESULT(outBuffer_ == 0, "clCreateBuffer(outBuffer) failed");
error_ =
_wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void *)&outBuffer);
int global = bufSize_ / sizeof(cl_float);
int local = 64;
size_t global_work_size[1] = {(size_t)global};
size_t local_work_size[1] = {(size_t)local};
CPerfCounter timer;
if (doWarmup) {
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
_wrapper->clFinish(cmd_queue_);
}
timer.Reset();
timer.Start();
void *mem;
for (unsigned int i = 0; i < mapTestList[_openTest]; i++) {
mem = _wrapper->clEnqueueMapBuffer(cmd_queue_, outBuffer, CL_TRUE,
CL_MAP_WRITE_INVALIDATE_REGION, 0,
bufSize_, 0, NULL, NULL, &error_);
CHECK_RESULT(error_, "clEnqueueMapBuffer failed");
error_ = _wrapper->clEnqueueUnmapMemObject(cmd_queue_, outBuffer, mem, 0,
NULL, NULL);
CHECK_RESULT(error_, "clEnqueueUnmapBuffer failed");
error_ = _wrapper->clEnqueueNDRangeKernel(
cmd_queue_, kernel_, 1, NULL, (const size_t *)global_work_size,
(const size_t *)local_work_size, 0, NULL, NULL);
CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed");
}
_wrapper->clFinish(cmd_queue_);
timer.Stop();
double sec = timer.GetElapsedTime();
// microseconds per launch
double perf = (1000000.f * sec / mapTestList[_openTest]);
const char *warmup;
if (doWarmup) {
warmup = "warmup";
} else {
warmup = "";
}
_perfInfo = (float)perf;
char buf[256];
SNPRINTF(buf, sizeof(buf), " %7d maps and dispatches %6s (us/disp)",
mapTestList[_openTest], warmup);
testDescString = buf;
_wrapper->clReleaseMemObject(outBuffer);
}
@@ -0,0 +1,58 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_DispatchSpeed_H_
#define _OCL_DispatchSpeed_H_
#include "OCLTestImp.h"
// Performance test that measures the host-side cost of dispatching a small
// OpenCL kernel, reported in microseconds per dispatch.
class OCLPerfDispatchSpeed : public OCLTestImp {
public:
OCLPerfDispatchSpeed();
virtual ~OCLPerfDispatchSpeed();
public:
// Creates the context, queue, buffer, program and kernel for sub-test 'test'
// on device 'deviceID'; sets 'conversion' to 1.0.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Runs the timed dispatch loop and fills in the result and description.
virtual void run(void);
// Releases the OpenCL objects created by open() and returns _crcword.
virtual unsigned int close(void);
// OpenCL C source of the kernel, produced by genShader().
std::string shader_;
// Fills shader_ with the kernel source.
void genShader(void);
// OpenCL objects created in open() and released in close().
cl_context context_;
cl_command_queue cmd_queue_;
cl_program program_;
cl_kernel kernel_;
cl_mem outBuffer_;
// Status of the most recent OpenCL call.
cl_int error_;
// True when one untimed dispatch is issued before timing starts.
bool doWarmup;
// Size of the output buffer in bytes.
unsigned int bufSize_;
// True: wait with clFinish(). False: clFlush() and poll the event status.
bool sleep;
// Number of entries in the test table used by this class.
unsigned int testListSize;
};
// Variant of OCLPerfDispatchSpeed whose timed loop maps and unmaps a buffer
// before every dispatch. It reuses open() and close() from the base class.
class OCLPerfMapDispatchSpeed : public OCLPerfDispatchSpeed {
public:
OCLPerfMapDispatchSpeed();
// Runs the timed map/unmap/dispatch loop.
virtual void run(void);
};
#endif // _OCL_DispatchSpeed_H_
@@ -0,0 +1,442 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "OCLPerfDoubleDMA.h"
#include <Timer.h>
#include <assert.h>
#include <stdio.h>
#include <cmath>
#include <sstream>
#include <string>
#include "CL/cl.h"
#include "CL/cl_ext.h"
// Dimensions, in cl_float4 elements, of the data set that is streamed through
// the device.
const size_t blockX = 256;
const size_t blockY = 256;
const size_t blockZ = 512;
// Number of Z slices moved by a single write/read transfer.
const size_t chunk = 16;
// Size in bytes of the whole host-side data set.
const size_t size_S = blockX * blockY * blockZ * sizeof(cl_float4);
// Size in bytes of one transfer and of each device buffer.
const size_t size_s = blockX * blockY * chunk * sizeof(cl_float4);
// Width, in characters, of one line of the profiling timeline.
static const int WindowWidth = 80;
// Maximum number of command queues (and device buffers) used by a sub-test.
const size_t MaxQueues = 3;
// Set by OCLPerfDoubleDMA::open() for the sub-tests that run with queue
// profiling enabled.
bool profEnable = false;
// OpenCL C source of the "dummy" kernel. Each work-item runs a multiply loop
// whose trip count depends on its global id (id / 0x400) and then writes one
// float4 to out[id].
static const char* strKernel =
"__kernel void dummy(__global float4* out) \n"
"{ \n"
" uint id = get_global_id(0); \n"
" float4 value = (float4)(1.0f, 2.0f, 3.0f, 4.0f); \n"
" uint factorial = 1; \n"
" for (uint i = 1; i < (id / 0x400); ++i)\n"
" { \n"
" factorial *= i; \n"
" } \n"
" out[id] = value * factorial; \n"
"} \n";
class ProfileQueue {
public:
enum Operation { Write = 0, Execute, Read, Total };
static const char* OperationName[Total];
static const char StartCommand[Total];
static const char ExecCommand[Total];
ProfileQueue() {}
~ProfileQueue() {
for (size_t op = 0; op < Total; ++op) {
for (size_t idx = 0; idx < events_[op].size(); ++idx) {
clReleaseEvent(events_[op][idx]);
}
}
}
void addEvent(Operation op, cl_event event) { events_[op].push_back(event); }
void findMinMax(cl_long* min, cl_long* max) {
// Find time min/max ranges for the frame scaling
for (size_t op = 0; (op < ProfileQueue::Total); ++op) {
cl_long time;
if (events_[op].size() == 0) continue;
clGetEventProfilingInfo(events_[op][0], CL_PROFILING_COMMAND_START,
sizeof(cl_long), &time, NULL);
if (0 == *min) {
*min = time;
} else {
*min = std::min(*min, time);
}
clGetEventProfilingInfo(events_[op][events_[op].size() - 1],
CL_PROFILING_COMMAND_END, sizeof(cl_long), &time,
NULL);
if (0 == *max) {
*max = time;
} else {
*max = std::max(*max, time);
}
}
}
void display(cl_long start, cl_long finish) {
std::string graph;
graph.resize(WindowWidth + 1);
graph[WindowWidth] = '\x0';
cl_long timeFrame = finish - start;
cl_long interval = timeFrame / WindowWidth;
// Find time min/max ranges for the frame scaling
for (size_t op = 0; (op < Total); ++op) {
if (events_[op].size() == 0) continue;
cl_long timeStart, timeEnd;
int begin = 0, end = 0;
for (size_t idx = 0; idx < events_[op].size(); ++idx) {
bool cutStart = false;
clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_START,
sizeof(cl_long), &timeStart, NULL);
clGetEventProfilingInfo(events_[op][idx], CL_PROFILING_COMMAND_END,
sizeof(cl_long), &timeEnd, NULL);
// Continue if out of the frame scope
if (timeStart >= finish) continue;
if (timeEnd <= start) continue;
if (timeStart <= start) {
timeStart = start;
cutStart = true;
}
if (timeEnd >= finish) {
timeEnd = finish;
}
// Readjust time to the frame
timeStart -= start;
timeEnd -= start;
timeStart = static_cast<cl_long>(
floor(static_cast<float>(timeStart) / interval + 0.5f));
timeEnd = static_cast<cl_long>(
floor(static_cast<float>(timeEnd) / interval + 0.5f));
begin = static_cast<int>(timeStart);
// Idle from end to begin
for (int c = end; c < begin; ++c) {
graph[c] = '-';
}
end = static_cast<int>(timeEnd);
for (int c = begin; c < end; ++c) {
if ((c == begin) && !cutStart) {
graph[c] = StartCommand[op];
} else {
graph[c] = ExecCommand[op];
}
}
if ((begin == end) && (end < WindowWidth)) {
graph[begin] = '+';
}
}
if (end < WindowWidth) {
for (int c = end; c < WindowWidth; ++c) {
graph[c] = '-';
}
}
printf("%s\n", graph.c_str());
}
}
private:
// Profiling events
std::vector<cl_event> events_[Total];
};
// Lookup tables indexed by ProfileQueue::Operation (Write, Execute, Read).
// OperationName is the label printed in the timeline legend.
const char* ProfileQueue::OperationName[Total] = {
"BufferWrite", "KernelExecution", "BufferRead"};
// Character drawn in the first column of an event.
const char ProfileQueue::StartCommand[Total] = {'W', 'X', 'R'};
// Character drawn in the remaining columns of an event.
const char ProfileQueue::ExecCommand[Total] = {'>', '#', '<'};
// Aggregates the per-queue profiling data of one run: it forwards events to
// the matching ProfileQueue, computes the overall time span they cover and
// prints a timeline for every queue. All recording and printing is skipped
// when profiling is disabled.
class Profile {
 public:
  Profile(bool profEna, int numQueues)
      : profileEna_(profEna),
        numQueues_(numQueues),
        min_(0),
        max_(0),
        execTime_(0) {}

  ~Profile() {}

  // Records 'event' for operation 'op' on queue index 'queue'.
  void addEvent(int queue, ProfileQueue::Operation op, cl_event event) {
    if (!profileEna_) {
      return;
    }
    profQueue[queue].addEvent(op, event);
  }

  // Returns the span between the earliest start and the latest end over all
  // queues. A non-zero result is cached and returned on later calls.
  cl_long findExecTime() {
    if (execTime_ == 0) {
      for (int queueIdx = 0; queueIdx < numQueues_; ++queueIdx) {
        profQueue[queueIdx].findMinMax(&min_, &max_);
      }
      execTime_ = max_ - min_;
    }
    return execTime_;
  }

  // Prints the legend and one timeline per queue for the frame
  // [start, finish), both given relative to the earliest recorded timestamp.
  void display(cl_long start, cl_long finish) {
    if (profileEna_) {
      const float frame = (float)(finish - start);
      printf("\n ----------- Time frame %.3f (us), scale 1:%.0f\n",
             frame / 1000, frame / (1000 * WindowWidth));

      for (size_t op = 0; op < ProfileQueue::Total; ++op) {
        printf("%s - %c%c; ", ProfileQueue::OperationName[op],
               ProfileQueue::StartCommand[op], ProfileQueue::ExecCommand[op]);
      }
      printf("\n");

      for (int queueIdx = 0; queueIdx < numQueues_; ++queueIdx) {
        printf("CommandQueue #%d\n", queueIdx);
        profQueue[queueIdx].display(min_ + start, min_ + finish);
      }
    }
  }

 private:
  bool profileEna_;   //!< True when events are recorded and displayed
  int numQueues_;     //!< Total number of queues
  cl_long min_;       //!< Min HW timestamp
  cl_long max_;       //!< Max HW timestamp
  cl_long execTime_;  //!< Profile time
  ProfileQueue profQueue[MaxQueues];
};
// Declares the sub-test space: {write/read, write/kernel/read} x
// {1..MaxQueues queues} x {profiling off, profiling on}.
OCLPerfDoubleDMA::OCLPerfDoubleDMA() {
  failed_ = false;
  _numSubTests = 2 * MaxQueues * 2;
}
// Nothing to release here; close() forwards the cleanup to OCLTestImp.
OCLPerfDoubleDMA::~OCLPerfDoubleDMA() {
}
// Prepares one double-DMA sub-test.
//
// test - sub-test index. Indices >= 2 * MaxQueues run with queue profiling
//        enabled; the index modulo 2 * MaxQueues is kept in test_ and decoded
//        by run() into the number of queues and the kernel on/off switch.
// units, conversion, deviceId - forwarded to OCLTestImp::open().
//
// Builds the "dummy" kernel and creates (test_ % MaxQueues) + 1 device
// buffers of size_s bytes followed by one CL_MEM_ALLOC_HOST_PTR buffer of
// size_S bytes; all of them are appended to buffers_. If the device is not a
// GPU, failed_ is set and run() becomes a no-op.
void OCLPerfDoubleDMA::open(unsigned int test, char* units, double& conversion,
                            unsigned int deviceId) {
  _deviceId = deviceId;
  OCLTestImp::open(test, units, conversion, deviceId);
  CHECK_RESULT((error_ != CL_SUCCESS), "Error opening test");
  test_ = test;

  cl_device_type deviceType;
  error_ = _wrapper->clGetDeviceInfo(devices_[deviceId], CL_DEVICE_TYPE,
                                     sizeof(deviceType), &deviceType, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "CL_DEVICE_TYPE failed");

  if (!(deviceType & CL_DEVICE_TYPE_GPU)) {
    printf("GPU device is required for this test!\n");
    failed_ = true;
    return;
  }

  program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL,
                                                 &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");

  error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL,
                                    NULL, NULL);
  if (error_ != CL_SUCCESS) {
    // Zero-initialised so that the printf below is safe even when the log
    // query fails and leaves the buffer untouched.
    char programLog[1024] = {0};
    _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                    CL_PROGRAM_BUILD_LOG, sizeof(programLog),
                                    programLog, 0);
    printf("\n%s\n", programLog);
    fflush(stdout);
  }
  CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed");

  kernel_ = _wrapper->clCreateKernel(program_, "dummy", &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed");

  size_t bufSize = size_s;
  cl_mem buffer;

  // profEnable is a global; assign it in both directions so that a
  // non-profiling sub-test opened after a profiling one does not inherit the
  // flag.
  profEnable = (test_ >= (2 * MaxQueues));
  test_ %= 2 * MaxQueues;

  size_t numBufs = (test_ % MaxQueues) + 1;
  for (size_t b = 0; b < numBufs; ++b) {
    buffer = _wrapper->clCreateBuffer(context_, CL_MEM_READ_WRITE, bufSize,
                                      NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
    buffers_.push_back(buffer);
  }

  // Host-visible staging buffer holding the whole data set; run() finds it at
  // index numBufs of buffers_.
  buffer = _wrapper->clCreateBuffer(context_,
                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                    size_S, NULL, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateBuffer() failed");
  buffers_.push_back(buffer);
}
// OpenCL context notification callback that deliberately ignores every
// notification.
static void CL_CALLBACK notify_callback(const char* errinfo,
                                        const void* private_info, size_t cb,
                                        void* user_data) {
  // Intentionally empty.
}
void OCLPerfDoubleDMA::run(void) {
if (failed_) {
return;
}
CPerfCounter timer;
const int numQueues = (test_ % MaxQueues) + 1;
const bool useKernel = ((test_ / MaxQueues) > 0);
const int numBufs = numQueues;
Profile profile(profEnable, numQueues);
std::vector<cl_command_queue> cmdQueues(numQueues);
int q;
cl_command_queue_properties qProp =
(profEnable) ? CL_QUEUE_PROFILING_ENABLE : 0;
for (q = 0; q < numQueues; ++q) {
cl_command_queue cmdQueue = _wrapper->clCreateCommandQueue(
context_, devices_[_deviceId], qProp, &error_);
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateCommandQueue() failed");
cmdQueues[q] = cmdQueue;
}
float* Data_s = (float*)_wrapper->clEnqueueMapBuffer(
cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
size_S, 0, NULL, NULL, &error_);
size_t gws[1] = {size_s / (4 * sizeof(float))};
size_t lws[1] = {256};
// Warm-up
for (q = 0; q < numQueues; ++q) {
error_ |=
_wrapper->clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
size_s, (char*)Data_s, 0, NULL, NULL);
error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
(void*)&buffers_[q]);
error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
gws, lws, 0, NULL, NULL);
error_ |=
_wrapper->clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0,
size_s, (char*)Data_s, 0, NULL, NULL);
error_ |= _wrapper->clFinish(cmdQueues[q]);
}
CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
size_t s_done = 0;
cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};
/*---------- pass2: copy Data_s to and from GPU Buffers ----------*/
s_done = 0;
timer.Reset();
timer.Start();
int idx = numBufs - 1;
// Start from the last so read/write won't go to the same DMA when kernel is
// executed
q = numQueues - 1;
size_t iter = 0;
while (1) {
if (0 == r[idx]) {
error_ |= _wrapper->clEnqueueWriteBuffer(
cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
(char*)Data_s + s_done, 0, NULL, &w[idx]);
} else {
error_ |= _wrapper->clEnqueueWriteBuffer(
cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
(char*)Data_s + s_done, 1, &r[idx], &w[idx]);
if (!profEnable) {
error_ |= _wrapper->clReleaseEvent(r[idx]);
}
}
_wrapper->clFlush(cmdQueues[q]);
profile.addEvent(q, ProfileQueue::Write, w[idx]);
if (useKernel) {
// Change the queue
++q %= numQueues;
// Implicit flush of DMA engine on kernel start, because memory dependency
error_ |= _wrapper->clSetKernelArg(kernel_, 0, sizeof(cl_mem),
(void*)&buffers_[idx]);
error_ |= _wrapper->clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL,
gws, lws, 1, &w[idx], &x[idx]);
if (!profEnable) {
error_ |= _wrapper->clReleaseEvent(w[idx]);
}
profile.addEvent(q, ProfileQueue::Execute, x[idx]);
}
_wrapper->clFlush(cmdQueues[q]);
// Change the queue
++q %= numQueues;
error_ |= _wrapper->clEnqueueReadBuffer(
cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s,
(char*)Data_s + s_done, 1, (useKernel) ? &x[idx] : &w[idx], &r[idx]);
if (!profEnable) {
error_ |= _wrapper->clReleaseEvent((useKernel) ? x[idx] : w[idx]);
}
profile.addEvent(q, ProfileQueue::Read, r[idx]);
_wrapper->clFlush(cmdQueues[q]);
if ((s_done += size_s) >= size_S) {
if (!profEnable) {
error_ |= _wrapper->clReleaseEvent(r[idx]);
}
break;
}
++iter;
++idx %= numBufs;
++q %= numQueues;
}
for (q = 0; q < numQueues; ++q) {
error_ |= _wrapper->clFinish(cmdQueues[q]);
}
timer.Stop();
error_ = _wrapper->clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs],
Data_s, 0, NULL, NULL);
error_ |= _wrapper->clFinish(cmdQueues[0]);
CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS), "Execution failed");
cl_long gpuTimeFrame = profile.findExecTime();
cl_long oneIter = gpuTimeFrame / iter;
// Display 4 iterations in the middle
cl_long startFrame = oneIter * (iter / 2 - 2);
cl_long finishFrame = oneIter * (iter / 2 + 2);
profile.display(startFrame, finishFrame);
for (q = 0; q < numQueues; ++q) {
error_ = _wrapper->clReleaseCommandQueue(cmdQueues[q]);
CHECK_RESULT_NO_RETURN((error_ != CL_SUCCESS),
"clReleaseCommandQueue() failed");
}
double GBytes = (double)(2 * size_S) / (double)(1000 * 1000 * 1000);
_perfInfo = static_cast<float>(GBytes / timer.GetElapsedTime());
std::stringstream stream;
if (useKernel) {
stream << "Write/Kernel/Read operation ";
} else {
stream << "Write/Read operation ";
}
stream << numQueues << " queues; profiling "
<< ((profEnable) ? "enabled" : "disabled") << " [GB/s]";
stream.flags(std::ios::right | std::ios::showbase);
testDescString = stream.str();
}
// Delegates all cleanup to the base class and passes its result through.
unsigned int OCLPerfDoubleDMA::close(void) {
  const unsigned int baseResult = OCLTestImp::close();
  return baseResult;
}
@@ -0,0 +1,42 @@
/* Copyright (c) 2010-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef _OCL_PERF_DOUBLE_DMA_H_
#define _OCL_PERF_DOUBLE_DMA_H_
#include "OCLTestImp.h"
// Performance test that streams a large host buffer through the device in
// pieces, using one to three command queues, and reports the transfer
// bandwidth in GB/s.
class OCLPerfDoubleDMA : public OCLTestImp {
public:
OCLPerfDoubleDMA();
virtual ~OCLPerfDoubleDMA();
public:
// Builds the kernel and creates the buffers for sub-test 'test'.
virtual void open(unsigned int test, char* units, double& conversion,
unsigned int deviceID);
// Runs the timed write/(kernel)/read pipeline.
virtual void run(void);
// Forwards to OCLTestImp::close().
virtual unsigned int close(void);
private:
// Set by open() when the device is not a GPU; run() then does nothing.
bool failed_;
// Sub-test index reduced modulo 2 * MaxQueues by open().
unsigned int test_;
};
#endif // _OCL_PERF_DOUBLE_DMA_H_

Some files were not shown because too many files have changed in this diff Show More