From 9035ae315434f564bf5fca6d927aff7b8dbe2bad Mon Sep 17 00:00:00 2001 From: TomSang Date: Tue, 14 Sep 2021 04:07:13 -0400 Subject: [PATCH] SWDEV-299773 - Enable performance tests on NV (#2337) 1. Simply enable test on NV Some need minor fix performance/compute/hipPerfDotProduct.cpp performance/dispatch/hipPerfDispatchSpeed.cpp performance/memory/hipPerfBufferCopyRectSpeed.cpp performance/memory/hipPerfBufferCopySpeed.cpp performance/memory/hipPerfDevMemReadSpeed.cpp performance/memory/hipPerfDevMemWriteSpeed.cpp performance/memory/hipPerfMemcpy.cpp performance/memory/hipPerfMemset.cpp performance/memory/hipPerfSharedMemReadSpeed.cpp performance/stream/hipPerfDeviceConcurrency.cpp performance/stream/hipPerfStreamCreateCopyDestroy.cpp 2. Enable and fix on NV performance/compute/hipPerfMandelbrot.cpp Root cause: coordIdx is random Solution: Initialize coordIdx correctly performance/memory/hipPerfMemFill.cpp Root cause: Hip ext Apis called. Solution: Exclude case with Hip ext Apis involved performance/memory/hipPerfMemMallocCpyFree.cpp Root cause: Test allocates device memory more than GPU has. Solution: Allocate device memory in terms of GPU capacity. tests/performance/memory/hipPerfSampleRate.cpp Root cause: Cuda has no operators += for float2 and float4. Solution: Provide the operators. performance/stream/hipPerfStreamConcurrency.cpp Root cause:float4 format doesn't match cude. operators are missing in cuda lib. Solution: Use (x, y, z, w) format. Add necessary float4 operatoris for cuda. Change-Id: I5add29ebabcfb21fb3ef89d09004c5d13423a291 --- .../performance/compute/hipPerfDotProduct.cpp | 2 +- .../performance/compute/hipPerfMandelbrot.cpp | 8 +- .../dispatch/hipPerfDispatchSpeed.cpp | 2 +- .../memory/hipPerfBufferCopyRectSpeed.cpp | 2 +- .../memory/hipPerfBufferCopySpeed.cpp | 2 +- .../memory/hipPerfDevMemReadSpeed.cpp | 2 +- .../memory/hipPerfDevMemWriteSpeed.cpp | 2 +- tests/performance/memory/hipPerfMemFill.cpp | 13 +- .../memory/hipPerfMemMallocCpyFree.cpp | 18 +- tests/performance/memory/hipPerfMemcpy.cpp | 2 +- tests/performance/memory/hipPerfMemset.cpp | 6 +- .../performance/memory/hipPerfSampleRate.cpp | 21 ++- .../memory/hipPerfSharedMemReadSpeed.cpp | 2 +- .../stream/hipPerfDeviceConcurrency.cpp | 4 +- .../stream/hipPerfStreamConcurrency.cpp | 157 +++++++++--------- .../stream/hipPerfStreamCreateCopyDestroy.cpp | 2 +- 16 files changed, 133 insertions(+), 112 deletions(-) diff --git a/tests/performance/compute/hipPerfDotProduct.cpp b/tests/performance/compute/hipPerfDotProduct.cpp index e1d444d5b5..3b2e0c72dd 100644 --- a/tests/performance/compute/hipPerfDotProduct.cpp +++ b/tests/performance/compute/hipPerfDotProduct.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/compute/hipPerfMandelbrot.cpp b/tests/performance/compute/hipPerfMandelbrot.cpp index 8e6c3a25cc..9f9d6b404e 100644 --- a/tests/performance/compute/hipPerfMandelbrot.cpp +++ b/tests/performance/compute/hipPerfMandelbrot.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -489,6 +489,7 @@ void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { unsigned int numStreams = getNumStreams(); + coordIdx = testCase % numCoords; funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; @@ -555,9 +556,6 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { double totalTime = 0.0; for (unsigned int k = 0; k < numLoops; k++) { - - coordIdx = testCase % numCoords; - if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 || testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) { @@ -653,7 +651,7 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { // Free host and device memory for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipHostFree(hPtr[i])); HIPCHECK(hipFree(dPtr[i])); } diff --git a/tests/performance/dispatch/hipPerfDispatchSpeed.cpp b/tests/performance/dispatch/hipPerfDispatchSpeed.cpp index 74d77f2113..d3a2c8c4ad 100644 --- a/tests/performance/dispatch/hipPerfDispatchSpeed.cpp +++ b/tests/performance/dispatch/hipPerfDispatchSpeed.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp b/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp index 7fd10673b9..78096844f5 100644 --- a/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp +++ b/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/memory/hipPerfBufferCopySpeed.cpp b/tests/performance/memory/hipPerfBufferCopySpeed.cpp index 0a046fa8c0..eda9a80230 100644 --- a/tests/performance/memory/hipPerfBufferCopySpeed.cpp +++ b/tests/performance/memory/hipPerfBufferCopySpeed.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/memory/hipPerfDevMemReadSpeed.cpp b/tests/performance/memory/hipPerfDevMemReadSpeed.cpp index 408a1949b4..6548da94cd 100644 --- a/tests/performance/memory/hipPerfDevMemReadSpeed.cpp +++ b/tests/performance/memory/hipPerfDevMemReadSpeed.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp b/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp index 21b19ff1de..cc48836604 100644 --- a/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp +++ b/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/memory/hipPerfMemFill.cpp b/tests/performance/memory/hipPerfMemFill.cpp index 0dbae58ce4..dd54ec6855 100644 --- a/tests/performance/memory/hipPerfMemFill.cpp +++ b/tests/performance/memory/hipPerfMemFill.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -136,6 +136,7 @@ class hipPerfMemFill { } HIPCHECK(hipSetDevice(deviceId)); + memset(&props_, 0, sizeof(props_)); HIPCHECK(hipGetDeviceProperties(&props_, deviceId)); std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x" << props_.pciBusID << " " << props_.name << " with " @@ -397,8 +398,9 @@ class hipPerfMemFill { return true; } - /* This fuction should be via device attribute query*/ + /* This function should be via device attribute query*/ bool supportDeviceMallocFinegrained() { +#ifdef __HIP_PLATFORM_AMD__ T *A = nullptr; hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained); if (!A) { @@ -406,6 +408,9 @@ class hipPerfMemFill { } HIPCHECK(hipFree(A)); return true; +#else + return false; +#endif } unsigned int setNumBlocks(size_t size) { @@ -419,6 +424,7 @@ class hipPerfMemFill { #endif } +#ifdef __HIP_PLATFORM_AMD__ bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) { double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); @@ -481,6 +487,7 @@ class hipPerfMemFill { return true; } +#endif bool run() { if (supportLargeBar()) { @@ -499,11 +506,13 @@ class hipPerfMemFill { return false; } +#ifdef __HIP_PLATFORM_AMD__ if (supportDeviceMallocFinegrained()) { if (!testExtDeviceMemory()) { return false; } } +#endif return true; } diff --git a/tests/performance/memory/hipPerfMemMallocCpyFree.cpp b/tests/performance/memory/hipPerfMemMallocCpyFree.cpp index 970d18a56d..94ceb68cba 100644 --- a/tests/performance/memory/hipPerfMemMallocCpyFree.cpp +++ b/tests/performance/memory/hipPerfMemMallocCpyFree.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -37,10 +37,15 @@ void valSet(int* A, int val, size_t size) { } } -void setup(size_t *size, const int num, int **pA) { +void setup(size_t *size, int &num, int **pA, const size_t totalGlobalMem) { + std::cout << "size: "; for (int i = 0; i < num; i++) { size[i] = 1 << (i + 6); + if((NUM_ITER + 1) * size[i] > totalGlobalMem) { + num = i; + break; + } std::cout << size[i] << " "; } std::cout << std::endl; @@ -77,11 +82,16 @@ int main() { size_t size[NUM_SIZE] = { 0 }; int *Ad[NUM_ITER] = { nullptr }; int *A; + hipDeviceProp_t props; + memset(&props, 0, sizeof(props)); + HIPCHECK(hipGetDeviceProperties(&props, 0)); + std::cout << "totalGlobalMem: " << props.totalGlobalMem << std::endl; - setup(size, NUM_SIZE, &A); + int num = NUM_SIZE; + setup(size, num, &A, props.totalGlobalMem); testInit(size[0], A); - for (int i = 0; i < NUM_SIZE; i++) { + for (int i = 0; i < num; i++) { std::cout << size[i] << std::endl; start = clock(); for (int j = 0; j < NUM_ITER; j++) { diff --git a/tests/performance/memory/hipPerfMemcpy.cpp b/tests/performance/memory/hipPerfMemcpy.cpp index c9dfedc99e..9751117eca 100644 --- a/tests/performance/memory/hipPerfMemcpy.cpp +++ b/tests/performance/memory/hipPerfMemcpy.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/memory/hipPerfMemset.cpp b/tests/performance/memory/hipPerfMemset.cpp index 4d6ae88711..25b62699be 100644 --- a/tests/performance/memory/hipPerfMemset.cpp +++ b/tests/performance/memory/hipPerfMemset.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -140,13 +140,13 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, HIPCHECK(hipStreamCreate(&stream)); // Warm-up - HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_)); + HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); auto start = chrono::steady_clock::now(); for (uint i = 0; i < NUM_ITER; i++) { if (type == hipMemsetTypeDefault && !async) { - HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_)); + HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); } else if (type == hipMemsetTypeDefault && async) { HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream)); diff --git a/tests/performance/memory/hipPerfSampleRate.cpp b/tests/performance/memory/hipPerfSampleRate.cpp index 1cb0741158..1ecadfe749 100644 --- a/tests/performance/memory/hipPerfSampleRate.cpp +++ b/tests/performance/memory/hipPerfSampleRate.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -42,6 +42,18 @@ vector sizes = {1, 2, 4, 8, 16, 32, #define NUM_BUFS 6 #define MAX_BUFS (1 << (NUM_BUFS - 1)) +#ifdef __HIP_PLATFORM_NVIDIA__ +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; a.y += b.y; +} + +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; +} +#endif + template __global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt, T **inBuffer, int numBufs) { @@ -49,7 +61,8 @@ __global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int w uint gid = (blockIdx.x * blockDim.x + threadIdx.x); uint inputIdx = gid % inBufSize; - T tmp = (T)0.0f; + T tmp; + memset(&tmp, 0, sizeof(T)); for(int i = 0; i < numBufs; i++) { tmp += *(*(inBuffer+i)+inputIdx); } @@ -264,11 +277,11 @@ void hipPerfSampleRate::run(unsigned int test) { // Free host and device memory for (uint i = 0; i < numBufs_; i++) { - HIPCHECK(hipFree(hInPtr[i])); + HIPCHECK(hipHostFree(hInPtr[i])); HIPCHECK(hipFree(dInPtr[i])); } - HIPCHECK(hipFree(hOutPtr)); + HIPCHECK(hipHostFree(hOutPtr)); HIPCHECK(hipFree(dPtr)); } diff --git a/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp b/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp index 4cdabd9303..539cf4105b 100644 --- a/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp +++ b/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ diff --git a/tests/performance/stream/hipPerfDeviceConcurrency.cpp b/tests/performance/stream/hipPerfDeviceConcurrency.cpp index 0c1a1ec6dc..664bdb47ed 100644 --- a/tests/performance/stream/hipPerfDeviceConcurrency.cpp +++ b/tests/performance/stream/hipPerfDeviceConcurrency.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -226,7 +226,7 @@ void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { HIPCHECK(hipStreamDestroy(streams[i])); // Free host and device memory - HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipHostFree(hPtr[i])); HIPCHECK(hipFree(dPtr[i])); } diff --git a/tests/performance/stream/hipPerfStreamConcurrency.cpp b/tests/performance/stream/hipPerfStreamConcurrency.cpp index 557ad7823c..16e29bc06b 100644 --- a/tests/performance/stream/hipPerfStreamConcurrency.cpp +++ b/tests/performance/stream/hipPerfStreamConcurrency.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */ @@ -28,6 +28,24 @@ #include "test_common.h" #include +#ifdef __HIP_PLATFORM_NVIDIA__ +inline __device__ float4 operator*(float s, float4 a) +{ + return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); +} +inline __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +#endif typedef struct { double x; @@ -39,157 +57,130 @@ static coordRec coords[] = { {0.0, 0.0, 0.00001}, // All black }; - static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); __global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, float yStep, uint maxIter) { - - int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % (width/4); int j = tid / (width/4); int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3); int4 vecj = make_int4(j, j, j, j); float4 x0; - x0.data[0] = (float)(xPos + xStep*veci.data[0]); - x0.data[1] = (float)(xPos + xStep*veci.data[1]); - x0.data[2] = (float)(xPos + xStep*veci.data[2]); - x0.data[3] = (float)(xPos + xStep*veci.data[3]); + x0.x = (float)(xPos + xStep*veci.x); + x0.y = (float)(xPos + xStep*veci.y); + x0.z = (float)(xPos + xStep*veci.z); + x0.w = (float)(xPos + xStep*veci.w); float4 y0; - y0.data[0] = (float)(yPos + yStep*vecj.data[0]); - y0.data[1] = (float)(yPos + yStep*vecj.data[1]); - y0.data[2] = (float)(yPos + yStep*vecj.data[2]); - y0.data[3] = (float)(yPos + yStep*vecj.data[3]); - + y0.x = (float)(yPos + yStep*vecj.x); + y0.y = (float)(yPos + yStep*vecj.y); + y0.z = (float)(yPos + yStep*vecj.z); + y0.w = (float)(yPos + yStep*vecj.w); float4 x = x0; float4 y = y0; - uint iter = 0; float4 tmp; int4 stay; int4 ccount = make_int4(0, 0, 0, 0); - float4 savx = x; float4 savy = y; - - stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f); - stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f); - stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f); - stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f); - - for (iter = 0; (stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && (iter < maxIter); + stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f); + stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f); + stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f); + stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f); + for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter+=16) { - - x = savx; y = savy; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - // Two iterations tmp = x*x + x0 - y*y; y = 2.0f * x * y + y0; x = tmp*tmp + x0 - y*y; y = 2.0f * tmp * y + y0; - - stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f); - stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f); - stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f); - stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f); - - savx.data[0] = (bool)(stay.data[0] ? x.data[0] : savx.data[0]); - savx.data[1] = (bool)(stay.data[1] ? x.data[1] : savx.data[1]); - savx.data[2] = (bool)(stay.data[2] ? x.data[2] : savx.data[2]); - savx.data[3] = (bool)(stay.data[3] ? x.data[3] : savx.data[3]); - - savy.data[0] = (bool)(stay.data[0] ? y.data[0] : savy.data[0]); - savy.data[1] = (bool)(stay.data[1] ? y.data[1] : savy.data[1]); - savy.data[2] = (bool)(stay.data[2] ? y.data[2] : savy.data[2]); - savy.data[3] = (bool)(stay.data[3] ? y.data[3] : savy.data[3]); - - ccount.data[0] -= stay.data[0]*16; - ccount.data[1] -= stay.data[1]*16; - ccount.data[2] -= stay.data[2]*16; - ccount.data[3] -= stay.data[3]*16; - } - - + stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f); + stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f); + stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f); + stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f); + savx.x = (bool)(stay.x ? x.x : savx.x); + savx.y = (bool)(stay.y ? x.y : savx.y); + savx.z = (bool)(stay.z ? x.z : savx.z); + savx.w = (bool)(stay.w ? x.w : savx.w); + savy.x = (bool)(stay.x ? y.x : savy.x); + savy.y = (bool)(stay.y ? y.y : savy.y); + savy.z = (bool)(stay.z ? y.z : savy.z); + savy.w = (bool)(stay.w ? y.w : savy.w); + ccount.x -= stay.x*16; + ccount.y -= stay.y*16; + ccount.z -= stay.z*16; + ccount.w -= stay.w*16; + } // Handle remainder - if (!(stay.data[0] & stay.data[1] & stay.data[2] & stay.data[3])) + if (!(stay.x & stay.y & stay.z & stay.w)) { iter = 16; do { x = savx; y = savy; - stay.x = ((x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= 4.0f) && (ccount.data[0] < maxIter); - stay.y = ((x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= 4.0f) && (ccount.data[1] < maxIter); - stay.z = ((x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= 4.0f) && (ccount.data[2] < maxIter); - stay.w = ((x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= 4.0f) && (ccount.data[3] < maxIter); + stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter); + stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter); + stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter); + stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter); tmp = x; x = x*x + x0 - y*y; y = 2.0f*tmp*y + y0; - ccount.data[0] += stay.data[0]; - ccount.data[1] += stay.data[1]; - ccount.data[2] += stay.data[2]; - ccount.data[3] += stay.data[3]; + ccount.x += stay.x; + ccount.y += stay.y; + ccount.z += stay.z; + ccount.w += stay.w; iter--; - savx.data[0] = (stay.data[0] ? x.data[0] : savx.data[0]); - savx.data[1] = (stay.data[1] ? x.data[1] : savx.data[1]); - savx.data[2] = (stay.data[2] ? x.data[2] : savx.data[2]); - savx.data[3] = (stay.data[3] ? x.data[3] : savx.data[3]); - savy.data[0] = (stay.data[0] ? y.data[0] : savy.data[0]); - savy.data[1] = (stay.data[1] ? y.data[1] : savy.data[1]); - savy.data[2] = (stay.data[2] ? y.data[2] : savy.data[2]); - savy.data[3] = (stay.data[3] ? y.data[3] : savy.data[3]); - } while ((stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && iter); + savx.x = (stay.x ? x.x : savx.x); + savx.y = (stay.y ? x.y : savx.y); + savx.z = (stay.z ? x.z : savx.z); + savx.w = (stay.w ? x.w : savx.w); + savy.x = (stay.x ? y.x : savy.x); + savy.y = (stay.y ? y.y : savy.y); + savy.z = (stay.z ? y.z : savy.z); + savy.w = (stay.w ? y.w : savy.w); + } while ((stay.x | stay.y | stay.z | stay.w) && iter); } - - uint4 *vecOut = (uint4 *)out; - - vecOut[tid].data[0] = (uint)(ccount.data[0]); - vecOut[tid].data[1] = (uint)(ccount.data[1]); - vecOut[tid].data[2] = (uint)(ccount.data[2]); - vecOut[tid].data[3] = (uint)(ccount.data[3]); + vecOut[tid].x = (uint)(ccount.x); + vecOut[tid].y = (uint)(ccount.y); + vecOut[tid].z = (uint)(ccount.z); + vecOut[tid].w = (uint)(ccount.w); } - class hipPerfStreamConcurrency { public: hipPerfStreamConcurrency(); @@ -307,7 +298,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) // Copy memory asynchronously and concurrently from host to device for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpyHtoDAsync(dPtr[i], hPtr[i], bufSize, streams[i % numStreams])); + HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams])); } @@ -341,7 +332,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) // Copy data back from device to the host for(uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpyDtoHAsync(hPtr[i] ,dPtr[i], bufSize, streams[i % numStreams])); + HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast(dPtr[i]), bufSize, streams[i % numStreams])); } @@ -361,7 +352,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) // Free host and device memory for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipHostFree(hPtr[i])); HIPCHECK(hipFree(dPtr[i])); } diff --git a/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp b/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp index 90292942db..103f40c7bb 100644 --- a/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp +++ b/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp @@ -18,7 +18,7 @@ */ /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia + * BUILD: %t %s ../../src/test_common.cpp * TEST: %t * HIT_END */