SWDEV-299773 - Enable performance tests on NV (#2337)
1. Simply enable test on NV
Some need minor fix
performance/compute/hipPerfDotProduct.cpp
performance/dispatch/hipPerfDispatchSpeed.cpp
performance/memory/hipPerfBufferCopyRectSpeed.cpp
performance/memory/hipPerfBufferCopySpeed.cpp
performance/memory/hipPerfDevMemReadSpeed.cpp
performance/memory/hipPerfDevMemWriteSpeed.cpp
performance/memory/hipPerfMemcpy.cpp
performance/memory/hipPerfMemset.cpp
performance/memory/hipPerfSharedMemReadSpeed.cpp
performance/stream/hipPerfDeviceConcurrency.cpp
performance/stream/hipPerfStreamCreateCopyDestroy.cpp
2. Enable and fix on NV
performance/compute/hipPerfMandelbrot.cpp
Root cause: coordIdx is random
Solution: Initialize coordIdx correctly
performance/memory/hipPerfMemFill.cpp
Root cause: Hip ext Apis called.
Solution: Exclude case with Hip ext Apis involved
performance/memory/hipPerfMemMallocCpyFree.cpp
Root cause: Test allocates device memory more than GPU has.
Solution: Allocate device memory in terms of GPU capacity.
tests/performance/memory/hipPerfSampleRate.cpp
Root cause: Cuda has no operators += for float2 and float4.
Solution: Provide the operators.
performance/stream/hipPerfStreamConcurrency.cpp
Root cause:float4 format doesn't match cude.
operators are missing in cuda lib.
Solution: Use (x, y, z, w) format.
Add necessary float4 operatoris for cuda.
Change-Id: I5add29ebabcfb21fb3ef89d09004c5d13423a291
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -489,6 +489,7 @@ void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,
|
||||
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
|
||||
|
||||
unsigned int numStreams = getNumStreams();
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
|
||||
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
|
||||
@@ -555,9 +556,6 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
|
||||
double totalTime = 0.0;
|
||||
|
||||
for (unsigned int k = 0; k < numLoops; k++) {
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
|
||||
testCase == 5 || testCase == 6 || testCase == 7 ||
|
||||
testCase == 10 || testCase == 11 || testCase == 12)) {
|
||||
@@ -653,7 +651,7 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -136,6 +136,7 @@ class hipPerfMemFill {
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
memset(&props_, 0, sizeof(props_));
|
||||
HIPCHECK(hipGetDeviceProperties(&props_, deviceId));
|
||||
std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x"
|
||||
<< props_.pciBusID << " " << props_.name << " with "
|
||||
@@ -397,8 +398,9 @@ class hipPerfMemFill {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* This fuction should be via device attribute query*/
|
||||
/* This function should be via device attribute query*/
|
||||
bool supportDeviceMallocFinegrained() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
T *A = nullptr;
|
||||
hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained);
|
||||
if (!A) {
|
||||
@@ -406,6 +408,9 @@ class hipPerfMemFill {
|
||||
}
|
||||
HIPCHECK(hipFree(A));
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
unsigned int setNumBlocks(size_t size) {
|
||||
@@ -419,6 +424,7 @@ class hipPerfMemFill {
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) {
|
||||
double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
|
||||
|
||||
@@ -481,6 +487,7 @@ class hipPerfMemFill {
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool run() {
|
||||
if (supportLargeBar()) {
|
||||
@@ -499,11 +506,13 @@ class hipPerfMemFill {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
if (supportDeviceMallocFinegrained()) {
|
||||
if (!testExtDeviceMemory()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
#include <time.h>
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -37,10 +37,15 @@ void valSet(int* A, int val, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
void setup(size_t *size, const int num, int **pA) {
|
||||
void setup(size_t *size, int &num, int **pA, const size_t totalGlobalMem) {
|
||||
|
||||
std::cout << "size: ";
|
||||
for (int i = 0; i < num; i++) {
|
||||
size[i] = 1 << (i + 6);
|
||||
if((NUM_ITER + 1) * size[i] > totalGlobalMem) {
|
||||
num = i;
|
||||
break;
|
||||
}
|
||||
std::cout << size[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
@@ -77,11 +82,16 @@ int main() {
|
||||
size_t size[NUM_SIZE] = { 0 };
|
||||
int *Ad[NUM_ITER] = { nullptr };
|
||||
int *A;
|
||||
hipDeviceProp_t props;
|
||||
memset(&props, 0, sizeof(props));
|
||||
HIPCHECK(hipGetDeviceProperties(&props, 0));
|
||||
std::cout << "totalGlobalMem: " << props.totalGlobalMem << std::endl;
|
||||
|
||||
setup(size, NUM_SIZE, &A);
|
||||
int num = NUM_SIZE;
|
||||
setup(size, num, &A, props.totalGlobalMem);
|
||||
testInit(size[0], A);
|
||||
|
||||
for (int i = 0; i < NUM_SIZE; i++) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
std::cout << size[i] << std::endl;
|
||||
start = clock();
|
||||
for (int j = 0; j < NUM_ITER; j++) {
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -140,13 +140,13 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type,
|
||||
HIPCHECK(hipStreamCreate(&stream));
|
||||
|
||||
// Warm-up
|
||||
HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_));
|
||||
HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
|
||||
|
||||
auto start = chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < NUM_ITER; i++) {
|
||||
if (type == hipMemsetTypeDefault && !async) {
|
||||
HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_));
|
||||
HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
|
||||
}
|
||||
else if (type == hipMemsetTypeDefault && async) {
|
||||
HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -42,6 +42,18 @@ vector<unsigned int> sizes = {1, 2, 4, 8, 16, 32,
|
||||
#define NUM_BUFS 6
|
||||
#define MAX_BUFS (1 << (NUM_BUFS - 1))
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
inline __host__ __device__ void operator+=(float2 &a, float2 b)
|
||||
{
|
||||
a.x += b.x; a.y += b.y;
|
||||
}
|
||||
|
||||
inline __host__ __device__ void operator+=(float4 &a, float4 b)
|
||||
{
|
||||
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
|
||||
T **inBuffer, int numBufs) {
|
||||
@@ -49,7 +61,8 @@ __global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int w
|
||||
uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
uint inputIdx = gid % inBufSize;
|
||||
|
||||
T tmp = (T)0.0f;
|
||||
T tmp;
|
||||
memset(&tmp, 0, sizeof(T));
|
||||
for(int i = 0; i < numBufs; i++) {
|
||||
tmp += *(*(inBuffer+i)+inputIdx);
|
||||
}
|
||||
@@ -264,11 +277,11 @@ void hipPerfSampleRate::run(unsigned int test) {
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numBufs_; i++) {
|
||||
HIPCHECK(hipFree(hInPtr[i]));
|
||||
HIPCHECK(hipHostFree(hInPtr[i]));
|
||||
HIPCHECK(hipFree(dInPtr[i]));
|
||||
}
|
||||
|
||||
HIPCHECK(hipFree(hOutPtr));
|
||||
HIPCHECK(hipHostFree(hOutPtr));
|
||||
HIPCHECK(hipFree(dPtr));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -226,7 +226,7 @@ void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
|
||||
// Free host and device memory
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -28,6 +28,24 @@
|
||||
#include "test_common.h"
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
inline __device__ float4 operator*(float s, float4 a)
|
||||
{
|
||||
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
|
||||
}
|
||||
inline __device__ float4 operator*(float4 a, float4 b)
|
||||
{
|
||||
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
|
||||
}
|
||||
inline __device__ float4 operator+(float4 a, float4 b)
|
||||
{
|
||||
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
|
||||
}
|
||||
inline __device__ float4 operator-(float4 a, float4 b)
|
||||
{
|
||||
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
double x;
|
||||
@@ -39,157 +57,130 @@ static coordRec coords[] = {
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
};
|
||||
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter) {
|
||||
|
||||
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % (width/4);
|
||||
int j = tid / (width/4);
|
||||
int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
|
||||
int4 vecj = make_int4(j, j, j, j);
|
||||
float4 x0;
|
||||
x0.data[0] = (float)(xPos + xStep*veci.data[0]);
|
||||
x0.data[1] = (float)(xPos + xStep*veci.data[1]);
|
||||
x0.data[2] = (float)(xPos + xStep*veci.data[2]);
|
||||
x0.data[3] = (float)(xPos + xStep*veci.data[3]);
|
||||
x0.x = (float)(xPos + xStep*veci.x);
|
||||
x0.y = (float)(xPos + xStep*veci.y);
|
||||
x0.z = (float)(xPos + xStep*veci.z);
|
||||
x0.w = (float)(xPos + xStep*veci.w);
|
||||
float4 y0;
|
||||
y0.data[0] = (float)(yPos + yStep*vecj.data[0]);
|
||||
y0.data[1] = (float)(yPos + yStep*vecj.data[1]);
|
||||
y0.data[2] = (float)(yPos + yStep*vecj.data[2]);
|
||||
y0.data[3] = (float)(yPos + yStep*vecj.data[3]);
|
||||
|
||||
y0.x = (float)(yPos + yStep*vecj.x);
|
||||
y0.y = (float)(yPos + yStep*vecj.y);
|
||||
y0.z = (float)(yPos + yStep*vecj.z);
|
||||
y0.w = (float)(yPos + yStep*vecj.w);
|
||||
float4 x = x0;
|
||||
float4 y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float4 tmp;
|
||||
int4 stay;
|
||||
int4 ccount = make_int4(0, 0, 0, 0);
|
||||
|
||||
float4 savx = x;
|
||||
float4 savy = y;
|
||||
|
||||
stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f);
|
||||
stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f);
|
||||
stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f);
|
||||
stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f);
|
||||
|
||||
for (iter = 0; (stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && (iter < maxIter);
|
||||
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
|
||||
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
|
||||
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
|
||||
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
|
||||
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
|
||||
iter+=16) {
|
||||
|
||||
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
|
||||
stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f);
|
||||
stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f);
|
||||
stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f);
|
||||
stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f);
|
||||
|
||||
savx.data[0] = (bool)(stay.data[0] ? x.data[0] : savx.data[0]);
|
||||
savx.data[1] = (bool)(stay.data[1] ? x.data[1] : savx.data[1]);
|
||||
savx.data[2] = (bool)(stay.data[2] ? x.data[2] : savx.data[2]);
|
||||
savx.data[3] = (bool)(stay.data[3] ? x.data[3] : savx.data[3]);
|
||||
|
||||
savy.data[0] = (bool)(stay.data[0] ? y.data[0] : savy.data[0]);
|
||||
savy.data[1] = (bool)(stay.data[1] ? y.data[1] : savy.data[1]);
|
||||
savy.data[2] = (bool)(stay.data[2] ? y.data[2] : savy.data[2]);
|
||||
savy.data[3] = (bool)(stay.data[3] ? y.data[3] : savy.data[3]);
|
||||
|
||||
ccount.data[0] -= stay.data[0]*16;
|
||||
ccount.data[1] -= stay.data[1]*16;
|
||||
ccount.data[2] -= stay.data[2]*16;
|
||||
ccount.data[3] -= stay.data[3]*16;
|
||||
}
|
||||
|
||||
|
||||
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
|
||||
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
|
||||
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
|
||||
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
|
||||
savx.x = (bool)(stay.x ? x.x : savx.x);
|
||||
savx.y = (bool)(stay.y ? x.y : savx.y);
|
||||
savx.z = (bool)(stay.z ? x.z : savx.z);
|
||||
savx.w = (bool)(stay.w ? x.w : savx.w);
|
||||
savy.x = (bool)(stay.x ? y.x : savy.x);
|
||||
savy.y = (bool)(stay.y ? y.y : savy.y);
|
||||
savy.z = (bool)(stay.z ? y.z : savy.z);
|
||||
savy.w = (bool)(stay.w ? y.w : savy.w);
|
||||
ccount.x -= stay.x*16;
|
||||
ccount.y -= stay.y*16;
|
||||
ccount.z -= stay.z*16;
|
||||
ccount.w -= stay.w*16;
|
||||
}
|
||||
// Handle remainder
|
||||
if (!(stay.data[0] & stay.data[1] & stay.data[2] & stay.data[3]))
|
||||
if (!(stay.x & stay.y & stay.z & stay.w))
|
||||
{
|
||||
iter = 16;
|
||||
do
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay.x = ((x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= 4.0f) && (ccount.data[0] < maxIter);
|
||||
stay.y = ((x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= 4.0f) && (ccount.data[1] < maxIter);
|
||||
stay.z = ((x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= 4.0f) && (ccount.data[2] < maxIter);
|
||||
stay.w = ((x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= 4.0f) && (ccount.data[3] < maxIter);
|
||||
stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter);
|
||||
stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter);
|
||||
stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter);
|
||||
stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter);
|
||||
tmp = x;
|
||||
x = x*x + x0 - y*y;
|
||||
y = 2.0f*tmp*y + y0;
|
||||
ccount.data[0] += stay.data[0];
|
||||
ccount.data[1] += stay.data[1];
|
||||
ccount.data[2] += stay.data[2];
|
||||
ccount.data[3] += stay.data[3];
|
||||
ccount.x += stay.x;
|
||||
ccount.y += stay.y;
|
||||
ccount.z += stay.z;
|
||||
ccount.w += stay.w;
|
||||
iter--;
|
||||
savx.data[0] = (stay.data[0] ? x.data[0] : savx.data[0]);
|
||||
savx.data[1] = (stay.data[1] ? x.data[1] : savx.data[1]);
|
||||
savx.data[2] = (stay.data[2] ? x.data[2] : savx.data[2]);
|
||||
savx.data[3] = (stay.data[3] ? x.data[3] : savx.data[3]);
|
||||
savy.data[0] = (stay.data[0] ? y.data[0] : savy.data[0]);
|
||||
savy.data[1] = (stay.data[1] ? y.data[1] : savy.data[1]);
|
||||
savy.data[2] = (stay.data[2] ? y.data[2] : savy.data[2]);
|
||||
savy.data[3] = (stay.data[3] ? y.data[3] : savy.data[3]);
|
||||
} while ((stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && iter);
|
||||
savx.x = (stay.x ? x.x : savx.x);
|
||||
savx.y = (stay.y ? x.y : savx.y);
|
||||
savx.z = (stay.z ? x.z : savx.z);
|
||||
savx.w = (stay.w ? x.w : savx.w);
|
||||
savy.x = (stay.x ? y.x : savy.x);
|
||||
savy.y = (stay.y ? y.y : savy.y);
|
||||
savy.z = (stay.z ? y.z : savy.z);
|
||||
savy.w = (stay.w ? y.w : savy.w);
|
||||
} while ((stay.x | stay.y | stay.z | stay.w) && iter);
|
||||
}
|
||||
|
||||
|
||||
uint4 *vecOut = (uint4 *)out;
|
||||
|
||||
vecOut[tid].data[0] = (uint)(ccount.data[0]);
|
||||
vecOut[tid].data[1] = (uint)(ccount.data[1]);
|
||||
vecOut[tid].data[2] = (uint)(ccount.data[2]);
|
||||
vecOut[tid].data[3] = (uint)(ccount.data[3]);
|
||||
vecOut[tid].x = (uint)(ccount.x);
|
||||
vecOut[tid].y = (uint)(ccount.y);
|
||||
vecOut[tid].z = (uint)(ccount.z);
|
||||
vecOut[tid].w = (uint)(ccount.w);
|
||||
}
|
||||
|
||||
|
||||
class hipPerfStreamConcurrency {
|
||||
public:
|
||||
hipPerfStreamConcurrency();
|
||||
@@ -307,7 +298,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpyHtoDAsync(dPtr[i], hPtr[i], bufSize, streams[i % numStreams]));
|
||||
HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
|
||||
}
|
||||
|
||||
|
||||
@@ -341,7 +332,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpyDtoHAsync(hPtr[i] ,dPtr[i], bufSize, streams[i % numStreams]));
|
||||
HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
|
||||
}
|
||||
|
||||
|
||||
@@ -361,7 +352,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user