From 9035ae315434f564bf5fca6d927aff7b8dbe2bad Mon Sep 17 00:00:00 2001
From: TomSang <Tao.Sang@amd.com>
Date: Tue, 14 Sep 2021 04:07:13 -0400
Subject: [PATCH] SWDEV-299773 - Enable performance tests on NV (#2337)

1. Simply enable test on NV
   Some need minor fix
performance/compute/hipPerfDotProduct.cpp
performance/dispatch/hipPerfDispatchSpeed.cpp
performance/memory/hipPerfBufferCopyRectSpeed.cpp
performance/memory/hipPerfBufferCopySpeed.cpp
performance/memory/hipPerfDevMemReadSpeed.cpp
performance/memory/hipPerfDevMemWriteSpeed.cpp
performance/memory/hipPerfMemcpy.cpp
performance/memory/hipPerfMemset.cpp
performance/memory/hipPerfSharedMemReadSpeed.cpp
performance/stream/hipPerfDeviceConcurrency.cpp
performance/stream/hipPerfStreamCreateCopyDestroy.cpp

2. Enable and fix on NV
performance/compute/hipPerfMandelbrot.cpp
   Root cause: coordIdx is random
   Solution: Initialize coordIdx correctly
performance/memory/hipPerfMemFill.cpp
   Root cause: Hip ext Apis called.
   Solution: Exclude case with Hip ext Apis involved
performance/memory/hipPerfMemMallocCpyFree.cpp
   Root cause: Test allocates device memory more than GPU has.
   Solution: Allocate device memory in terms of GPU capacity.
tests/performance/memory/hipPerfSampleRate.cpp
   Root cause: Cuda has no operators += for float2 and float4.
   Solution: Provide the operators.
performance/stream/hipPerfStreamConcurrency.cpp
   Root cause:float4 format doesn't match cude.
              operators are missing in cuda lib.
   Solution: Use (x, y, z, w) format.
             Add necessary float4 operatoris for cuda.

Change-Id: I5add29ebabcfb21fb3ef89d09004c5d13423a291
---
 .../performance/compute/hipPerfDotProduct.cpp |   2 +-
 .../performance/compute/hipPerfMandelbrot.cpp |   8 +-
 .../dispatch/hipPerfDispatchSpeed.cpp         |   2 +-
 .../memory/hipPerfBufferCopyRectSpeed.cpp     |   2 +-
 .../memory/hipPerfBufferCopySpeed.cpp         |   2 +-
 .../memory/hipPerfDevMemReadSpeed.cpp         |   2 +-
 .../memory/hipPerfDevMemWriteSpeed.cpp        |   2 +-
 tests/performance/memory/hipPerfMemFill.cpp   |  13 +-
 .../memory/hipPerfMemMallocCpyFree.cpp        |  18 +-
 tests/performance/memory/hipPerfMemcpy.cpp    |   2 +-
 tests/performance/memory/hipPerfMemset.cpp    |   6 +-
 .../performance/memory/hipPerfSampleRate.cpp  |  21 ++-
 .../memory/hipPerfSharedMemReadSpeed.cpp      |   2 +-
 .../stream/hipPerfDeviceConcurrency.cpp       |   4 +-
 .../stream/hipPerfStreamConcurrency.cpp       | 157 +++++++++---------
 .../stream/hipPerfStreamCreateCopyDestroy.cpp |   2 +-
 16 files changed, 133 insertions(+), 112 deletions(-)

diff --git a/tests/performance/compute/hipPerfDotProduct.cpp b/tests/performance/compute/hipPerfDotProduct.cpp
index e1d444d5b5..3b2e0c72dd 100644
--- a/tests/performance/compute/hipPerfDotProduct.cpp
+++ b/tests/performance/compute/hipPerfDotProduct.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/compute/hipPerfMandelbrot.cpp b/tests/performance/compute/hipPerfMandelbrot.cpp
index 8e6c3a25cc..9f9d6b404e 100644
--- a/tests/performance/compute/hipPerfMandelbrot.cpp
+++ b/tests/performance/compute/hipPerfMandelbrot.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -489,6 +489,7 @@ void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,
 void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
 
   unsigned int numStreams = getNumStreams();
+  coordIdx = testCase % numCoords;
 
   funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
                &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
@@ -555,9 +556,6 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
   double totalTime = 0.0;
 
   for (unsigned int k = 0; k < numLoops; k++) {
-
-  coordIdx = testCase % numCoords;
-
   if ((testCase == 0 || testCase == 1 || testCase == 2 ||
                   testCase == 5 || testCase == 6 || testCase == 7 ||
                   testCase == 10 || testCase == 11 || testCase == 12)) {
@@ -653,7 +651,7 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
 
   // Free host and device memory
   for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipFree(hPtr[i]));
+    HIPCHECK(hipHostFree(hPtr[i]));
     HIPCHECK(hipFree(dPtr[i]));
   }
 
diff --git a/tests/performance/dispatch/hipPerfDispatchSpeed.cpp b/tests/performance/dispatch/hipPerfDispatchSpeed.cpp
index 74d77f2113..d3a2c8c4ad 100644
--- a/tests/performance/dispatch/hipPerfDispatchSpeed.cpp
+++ b/tests/performance/dispatch/hipPerfDispatchSpeed.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp b/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp
index 7fd10673b9..78096844f5 100644
--- a/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp
+++ b/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/memory/hipPerfBufferCopySpeed.cpp b/tests/performance/memory/hipPerfBufferCopySpeed.cpp
index 0a046fa8c0..eda9a80230 100644
--- a/tests/performance/memory/hipPerfBufferCopySpeed.cpp
+++ b/tests/performance/memory/hipPerfBufferCopySpeed.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/memory/hipPerfDevMemReadSpeed.cpp b/tests/performance/memory/hipPerfDevMemReadSpeed.cpp
index 408a1949b4..6548da94cd 100644
--- a/tests/performance/memory/hipPerfDevMemReadSpeed.cpp
+++ b/tests/performance/memory/hipPerfDevMemReadSpeed.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp b/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp
index 21b19ff1de..cc48836604 100644
--- a/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp
+++ b/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/memory/hipPerfMemFill.cpp b/tests/performance/memory/hipPerfMemFill.cpp
index 0dbae58ce4..dd54ec6855 100644
--- a/tests/performance/memory/hipPerfMemFill.cpp
+++ b/tests/performance/memory/hipPerfMemFill.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -136,6 +136,7 @@ class hipPerfMemFill {
     }
 
     HIPCHECK(hipSetDevice(deviceId));
+    memset(&props_, 0, sizeof(props_));
     HIPCHECK(hipGetDeviceProperties(&props_, deviceId));
     std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x"
         << props_.pciBusID << " " << props_.name << " with "
@@ -397,8 +398,9 @@ class hipPerfMemFill {
     return true;
   }
 
-  /* This fuction should be via device attribute query*/
+  /* This function should be via device attribute query*/
   bool supportDeviceMallocFinegrained() {
+#ifdef __HIP_PLATFORM_AMD__
     T *A = nullptr;
     hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained);
     if (!A) {
@@ -406,6 +408,9 @@ class hipPerfMemFill {
     }
     HIPCHECK(hipFree(A));
     return true;
+#else
+    return false;
+#endif
   }
 
   unsigned int setNumBlocks(size_t size) {
@@ -419,6 +424,7 @@ class hipPerfMemFill {
 #endif
   }
 
+#ifdef __HIP_PLATFORM_AMD__
   bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) {
     double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
 
@@ -481,6 +487,7 @@ class hipPerfMemFill {
 
     return true;
   }
+#endif
 
   bool run() {
     if (supportLargeBar()) {
@@ -499,11 +506,13 @@ class hipPerfMemFill {
       return false;
     }
 
+#ifdef __HIP_PLATFORM_AMD__
     if (supportDeviceMallocFinegrained()) {
       if (!testExtDeviceMemory()) {
         return false;
       }
     }
+#endif
     return true;
   }
 
diff --git a/tests/performance/memory/hipPerfMemMallocCpyFree.cpp b/tests/performance/memory/hipPerfMemMallocCpyFree.cpp
index 970d18a56d..94ceb68cba 100644
--- a/tests/performance/memory/hipPerfMemMallocCpyFree.cpp
+++ b/tests/performance/memory/hipPerfMemMallocCpyFree.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 #include <time.h>
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -37,10 +37,15 @@ void valSet(int* A, int val, size_t size) {
     }
 }
 
-void setup(size_t *size, const int num, int **pA) {
+void setup(size_t *size, int &num, int **pA, const size_t totalGlobalMem) {
+
     std::cout << "size: ";
     for (int i = 0; i < num; i++) {
         size[i] = 1 << (i + 6);
+        if((NUM_ITER + 1) * size[i] > totalGlobalMem) {
+          num = i;
+          break;
+        }
         std::cout << size[i] << " ";
     }
     std::cout << std::endl;
@@ -77,11 +82,16 @@ int main() {
     size_t size[NUM_SIZE] = { 0 };
     int *Ad[NUM_ITER] = { nullptr };
     int *A;
+    hipDeviceProp_t props;
+    memset(&props, 0, sizeof(props));
+    HIPCHECK(hipGetDeviceProperties(&props, 0));
+    std::cout << "totalGlobalMem: " << props.totalGlobalMem << std::endl;
 
-    setup(size, NUM_SIZE, &A);
+    int num = NUM_SIZE;
+    setup(size, num, &A, props.totalGlobalMem);
     testInit(size[0], A);
 
-    for (int i = 0; i < NUM_SIZE; i++) {
+    for (int i = 0; i < num; i++) {
         std::cout << size[i] << std::endl;
         start = clock();
         for (int j = 0; j < NUM_ITER; j++) {
diff --git a/tests/performance/memory/hipPerfMemcpy.cpp b/tests/performance/memory/hipPerfMemcpy.cpp
index c9dfedc99e..9751117eca 100644
--- a/tests/performance/memory/hipPerfMemcpy.cpp
+++ b/tests/performance/memory/hipPerfMemcpy.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/memory/hipPerfMemset.cpp b/tests/performance/memory/hipPerfMemset.cpp
index 4d6ae88711..25b62699be 100644
--- a/tests/performance/memory/hipPerfMemset.cpp
+++ b/tests/performance/memory/hipPerfMemset.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -140,13 +140,13 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type,
   HIPCHECK(hipStreamCreate(&stream));
 
   // Warm-up
-  HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_));
+  HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
 
   auto start = chrono::steady_clock::now();
 
   for (uint i = 0; i < NUM_ITER; i++) {
     if (type == hipMemsetTypeDefault && !async) {
-      HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_));
+      HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
     }
     else if (type == hipMemsetTypeDefault && async) {
       HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
diff --git a/tests/performance/memory/hipPerfSampleRate.cpp b/tests/performance/memory/hipPerfSampleRate.cpp
index 1cb0741158..1ecadfe749 100644
--- a/tests/performance/memory/hipPerfSampleRate.cpp
+++ b/tests/performance/memory/hipPerfSampleRate.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -42,6 +42,18 @@ vector<unsigned int> sizes = {1,  2,   4,   8,   16,   32,
 #define NUM_BUFS 6
 #define MAX_BUFS (1 << (NUM_BUFS - 1))
 
+#ifdef __HIP_PLATFORM_NVIDIA__
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+  a.x += b.x; a.y += b.y;
+}
+
+inline __host__ __device__ void operator+=(float4 &a, float4 b)
+{
+  a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+#endif
+
 template <typename T>
 __global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
                            T **inBuffer, int numBufs) {
@@ -49,7 +61,8 @@ __global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int w
   uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
   uint inputIdx = gid % inBufSize;
 
-  T tmp = (T)0.0f;
+  T tmp;
+  memset(&tmp, 0, sizeof(T));
   for(int i = 0; i < numBufs; i++) {
     tmp += *(*(inBuffer+i)+inputIdx);
   }
@@ -264,11 +277,11 @@ void hipPerfSampleRate::run(unsigned int test) {
 
    // Free host and device memory
    for (uint i = 0; i < numBufs_; i++) {
-    HIPCHECK(hipFree(hInPtr[i]));
+    HIPCHECK(hipHostFree(hInPtr[i]));
     HIPCHECK(hipFree(dInPtr[i]));
     }
 
-   HIPCHECK(hipFree(hOutPtr));
+   HIPCHECK(hipHostFree(hOutPtr));
    HIPCHECK(hipFree(dPtr));
 }
 
diff --git a/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp b/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp
index 4cdabd9303..539cf4105b 100644
--- a/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp
+++ b/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/performance/stream/hipPerfDeviceConcurrency.cpp b/tests/performance/stream/hipPerfDeviceConcurrency.cpp
index 0c1a1ec6dc..664bdb47ed 100644
--- a/tests/performance/stream/hipPerfDeviceConcurrency.cpp
+++ b/tests/performance/stream/hipPerfDeviceConcurrency.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -226,7 +226,7 @@ void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
   HIPCHECK(hipStreamDestroy(streams[i]));
 
   // Free host and device memory
-  HIPCHECK(hipFree(hPtr[i]));
+  HIPCHECK(hipHostFree(hPtr[i]));
   HIPCHECK(hipFree(dPtr[i]));
   }
 
diff --git a/tests/performance/stream/hipPerfStreamConcurrency.cpp b/tests/performance/stream/hipPerfStreamConcurrency.cpp
index 557ad7823c..16e29bc06b 100644
--- a/tests/performance/stream/hipPerfStreamConcurrency.cpp
+++ b/tests/performance/stream/hipPerfStreamConcurrency.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -28,6 +28,24 @@
 #include "test_common.h"
 #include <hip/hip_vector_types.h>
 
+#ifdef __HIP_PLATFORM_NVIDIA__
+inline __device__ float4 operator*(float s, float4 a)
+{
+  return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
+}
+inline __device__ float4 operator*(float4 a, float4 b)
+{
+  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __device__ float4 operator+(float4 a, float4 b)
+{
+  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __device__ float4 operator-(float4 a, float4 b)
+{
+  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+#endif
 
 typedef struct {
   double x;
@@ -39,157 +57,130 @@ static coordRec coords[] = {
     {0.0, 0.0, 0.00001},  // All black
 };
 
-
 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
 
 __global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
          float xStep, float yStep, uint maxIter) {
-
-
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
   int i = tid % (width/4);
   int j = tid / (width/4);
   int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
   int4 vecj = make_int4(j, j, j, j);
   float4 x0;
-  x0.data[0] = (float)(xPos + xStep*veci.data[0]);
-  x0.data[1] = (float)(xPos + xStep*veci.data[1]);
-  x0.data[2] = (float)(xPos + xStep*veci.data[2]);
-  x0.data[3] = (float)(xPos + xStep*veci.data[3]);
+  x0.x = (float)(xPos + xStep*veci.x);
+  x0.y = (float)(xPos + xStep*veci.y);
+  x0.z = (float)(xPos + xStep*veci.z);
+  x0.w = (float)(xPos + xStep*veci.w);
   float4 y0;
-  y0.data[0] = (float)(yPos + yStep*vecj.data[0]);
-  y0.data[1] = (float)(yPos + yStep*vecj.data[1]);
-  y0.data[2] = (float)(yPos + yStep*vecj.data[2]);
-  y0.data[3] = (float)(yPos + yStep*vecj.data[3]);
-
+  y0.x = (float)(yPos + yStep*vecj.x);
+  y0.y = (float)(yPos + yStep*vecj.y);
+  y0.z = (float)(yPos + yStep*vecj.z);
+  y0.w = (float)(yPos + yStep*vecj.w);
   float4 x = x0;
   float4 y = y0;
-
   uint iter = 0;
   float4 tmp;
   int4 stay;
   int4 ccount = make_int4(0, 0, 0, 0);
-
   float4 savx = x;
   float4 savy = y;
-
-  stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f);
-  stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f);
-  stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f);
-  stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f);
-
-  for (iter = 0; (stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && (iter < maxIter);
+  stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
+  stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
+  stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
+  stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
+  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
   iter+=16) {
-
-
     x = savx;
     y = savy;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
     // Two iterations
     tmp = x*x + x0 - y*y;
     y = 2.0f * x * y + y0;
     x = tmp*tmp + x0 - y*y;
     y = 2.0f * tmp * y + y0;
-
-    stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f);
-    stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f);
-    stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f);
-    stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f);
-
-    savx.data[0] = (bool)(stay.data[0] ? x.data[0] : savx.data[0]);
-    savx.data[1] = (bool)(stay.data[1] ? x.data[1] : savx.data[1]);
-    savx.data[2] = (bool)(stay.data[2] ? x.data[2] : savx.data[2]);
-    savx.data[3] = (bool)(stay.data[3] ? x.data[3] : savx.data[3]);
-
-    savy.data[0] = (bool)(stay.data[0] ? y.data[0] : savy.data[0]);
-    savy.data[1] = (bool)(stay.data[1] ? y.data[1] : savy.data[1]);
-    savy.data[2] = (bool)(stay.data[2] ? y.data[2] : savy.data[2]);
-    savy.data[3] = (bool)(stay.data[3] ? y.data[3] : savy.data[3]);
-
-    ccount.data[0] -= stay.data[0]*16;
-    ccount.data[1] -= stay.data[1]*16;
-    ccount.data[2] -= stay.data[2]*16;
-    ccount.data[3] -= stay.data[3]*16;
-    }
-
-
+    stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
+    stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
+    stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
+    stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
+    savx.x = (bool)(stay.x ? x.x : savx.x);
+    savx.y = (bool)(stay.y ? x.y : savx.y);
+    savx.z = (bool)(stay.z ? x.z : savx.z);
+    savx.w = (bool)(stay.w ? x.w : savx.w);
+    savy.x = (bool)(stay.x ? y.x : savy.x);
+    savy.y = (bool)(stay.y ? y.y : savy.y);
+    savy.z = (bool)(stay.z ? y.z : savy.z);
+    savy.w = (bool)(stay.w ? y.w : savy.w);
+    ccount.x -= stay.x*16;
+    ccount.y -= stay.y*16;
+    ccount.z -= stay.z*16;
+    ccount.w -= stay.w*16;
+  }
   // Handle remainder
-  if (!(stay.data[0] & stay.data[1] & stay.data[2] & stay.data[3]))
+  if (!(stay.x & stay.y & stay.z & stay.w))
   {
     iter = 16;
     do
     {
       x = savx;
       y = savy;
-      stay.x = ((x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= 4.0f) && (ccount.data[0] <  maxIter);
-      stay.y = ((x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= 4.0f) && (ccount.data[1] <  maxIter);
-      stay.z = ((x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= 4.0f) && (ccount.data[2] <  maxIter);
-      stay.w = ((x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= 4.0f) && (ccount.data[3] <  maxIter);
+      stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x <  maxIter);
+      stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y <  maxIter);
+      stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z <  maxIter);
+      stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w <  maxIter);
       tmp = x;
       x = x*x + x0 - y*y;
       y = 2.0f*tmp*y + y0;
-      ccount.data[0] += stay.data[0];
-      ccount.data[1] += stay.data[1];
-      ccount.data[2] += stay.data[2];
-      ccount.data[3] += stay.data[3];
+      ccount.x += stay.x;
+      ccount.y += stay.y;
+      ccount.z += stay.z;
+      ccount.w += stay.w;
       iter--;
-      savx.data[0] = (stay.data[0] ? x.data[0] : savx.data[0]);
-      savx.data[1] = (stay.data[1] ? x.data[1] : savx.data[1]);
-      savx.data[2] = (stay.data[2] ? x.data[2] : savx.data[2]);
-      savx.data[3] = (stay.data[3] ? x.data[3] : savx.data[3]);
-      savy.data[0] = (stay.data[0] ? y.data[0] : savy.data[0]);
-      savy.data[1] = (stay.data[1] ? y.data[1] : savy.data[1]);
-      savy.data[2] = (stay.data[2] ? y.data[2] : savy.data[2]);
-      savy.data[3] = (stay.data[3] ? y.data[3] : savy.data[3]);
-    } while ((stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && iter);
+      savx.x = (stay.x ? x.x : savx.x);
+      savx.y = (stay.y ? x.y : savx.y);
+      savx.z = (stay.z ? x.z : savx.z);
+      savx.w = (stay.w ? x.w : savx.w);
+      savy.x = (stay.x ? y.x : savy.x);
+      savy.y = (stay.y ? y.y : savy.y);
+      savy.z = (stay.z ? y.z : savy.z);
+      savy.w = (stay.w ? y.w : savy.w);
+    } while ((stay.x | stay.y | stay.z | stay.w) && iter);
   }
-
-
   uint4 *vecOut = (uint4 *)out;
-
-  vecOut[tid].data[0] = (uint)(ccount.data[0]);
-  vecOut[tid].data[1] = (uint)(ccount.data[1]);
-  vecOut[tid].data[2] = (uint)(ccount.data[2]);
-  vecOut[tid].data[3] = (uint)(ccount.data[3]);
+  vecOut[tid].x = (uint)(ccount.x);
+  vecOut[tid].y = (uint)(ccount.y);
+  vecOut[tid].z = (uint)(ccount.z);
+  vecOut[tid].w = (uint)(ccount.w);
 }
 
-
 class hipPerfStreamConcurrency {
   public:
   hipPerfStreamConcurrency();
@@ -307,7 +298,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
 
   // Copy memory asynchronously and concurrently from host to device
   for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpyHtoDAsync(dPtr[i], hPtr[i], bufSize, streams[i % numStreams]));
+    HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
   }
 
 
@@ -341,7 +332,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
 
   // Copy data back from device to the host
   for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpyDtoHAsync(hPtr[i] ,dPtr[i], bufSize, streams[i % numStreams]));
+    HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
   }
 
 
@@ -361,7 +352,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
 
   // Free host and device memory
   for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipFree(hPtr[i]));
+    HIPCHECK(hipHostFree(hPtr[i]));
     HIPCHECK(hipFree(dPtr[i]));
   }
 
diff --git a/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp b/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp
index 90292942db..103f40c7bb 100644
--- a/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp
+++ b/tests/performance/stream/hipPerfStreamCreateCopyDestroy.cpp
@@ -18,7 +18,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * BUILD: %t %s ../../src/test_common.cpp
  * TEST: %t
  * HIT_END
  */