SWDEV-546217 Complete hip-test Port to Catch2 Framework [Stream and Compute Folder] (#559)

* SWDEV-546498 hipPerfDeviceConcurrency * SWDEV-546500 hipPerfStreamConcurrency * SWDEV-546502 hipPerfStreamCreateCopyDestroy.c * SWDEV-546479 hipPerfDotProduct * SWDEV-546482 hipPerfMandelbrot [ROCm/hip-tests commit: 9fdc9a98b7]
2025-08-15 15:38:33 -04:00
@@ -18,10 +18,10 @@
 */

 /**
-* @addtogroup hipPerfDotProduct hipPerfDotProduct
-* @{
-* @ingroup perfComputeTest
-*/
+ * @addtogroup hipPerfDotProduct hipPerfDotProduct
+ * @{
+ * @ingroup perfComputeTest
+ */

 #include <hip_test_common.hh>
 #include <vector>
@@ -31,11 +31,9 @@
 using namespace std;

 template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_not_equal(int n,
-                                 const double* __restrict__ x,
-                                 const double* __restrict__ y,
-                                 double* __restrict__ workspace) {
+__launch_bounds__(BLOCKSIZE) __global__
+    void vectors_not_equal(int n, const double* __restrict__ x, const double* __restrict__ y,
+                           double* __restrict__ workspace) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;

  double sum = 0.0;
@@ -93,9 +91,8 @@ __global__ void vectors_not_equal(int n,
 }

 template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_equal(int n, const double* __restrict__ x,
-                                  double* __restrict__ workspace) {
+__launch_bounds__(BLOCKSIZE) __global__
+    void vectors_equal(int n, const double* __restrict__ x, double* __restrict__ workspace) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;

  double sum = 0.0;
@@ -129,7 +126,7 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
  __syncthreads();

  if (threadIdx.x < 8) {
-          sdata[threadIdx.x] += sdata[threadIdx.x + 8];
+    sdata[threadIdx.x] += sdata[threadIdx.x + 8];
  }
  __syncthreads();

@@ -149,12 +146,11 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,

  if (threadIdx.x == 0) {
    workspace[blockIdx.x] = sdata[0];
-    }
+  }
 }

 template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void dot_reduction(double* __restrict__ workspace) {
+__launch_bounds__(BLOCKSIZE) __global__ void dot_reduction(double* __restrict__ workspace) {
  __shared__ double sdata[BLOCKSIZE];

  sdata[threadIdx.x] = workspace[threadIdx.x];
@@ -187,7 +183,8 @@ __global__ void dot_reduction(double* __restrict__ workspace) {

  if (threadIdx.x < 4) {
    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  } __syncthreads();
+  }
+  __syncthreads();

  if (threadIdx.x < 2) {
    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
@@ -203,8 +200,7 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
  }
 }

-void computeDotProduct(int n, const double* x, const double* y, double& result,
-                      double* workspace) {
+void computeDotProduct(int n, const double* x, const double* y, double& result, double* workspace) {
  dim3 blocks(DOT_DIM);
  dim3 threadsPerBlock(DOT_DIM);

@@ -225,16 +221,16 @@ void computeDotProduct(int n, const double* x, const double* y, double& result,
 }

 /**
-* Test Description
-* ------------------------
-*  - Verify the device kernel results comparing it with the host results.
-* Test source
-* ------------------------
-*  - perftests/compute/hipPerfDotProduct.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the device kernel results comparing it with the host results.
+ * Test source
+ * ------------------------
+ *  - perftests/compute/hipPerfDotProduct.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */

 TEST_CASE("Perf_hipPerfDotProduct") {
  int nGpu = 0;
@@ -252,120 +248,120 @@ TEST_CASE("Perf_hipPerfDotProduct") {
  for (unsigned int testCase = 0; testCase < 3; testCase++) {
    vector<int> vectorSize = {200, 300, 50};
    switch (testCase) {
-    case 0:
-    nx = vectorSize[0];
-    ny = vectorSize[0];
-    nz = vectorSize[0];
-    break;
+      case 0:
+        nx = vectorSize[0];
+        ny = vectorSize[0];
+        nz = vectorSize[0];
+        break;

-    case 1:
-    nx = vectorSize[1];
-    ny = vectorSize[1];
-    nz = vectorSize[1];
-    break;
+      case 1:
+        nx = vectorSize[1];
+        ny = vectorSize[1];
+        nz = vectorSize[1];
+        break;

-    case 2:
-    nx = vectorSize[0];
-    ny = vectorSize[1];
-    nz = vectorSize[2];
-    break;
+      case 2:
+        nx = vectorSize[0];
+        ny = vectorSize[1];
+        nz = vectorSize[2];
+        break;

-    default:
-      break;
-  }
+      default:
+        break;
+    }

-  int trials = 200;
-  int size = nx * ny * nz;
+    int trials = 200;
+    int size = nx * ny * nz;

-  vector<double> hx(size);
-  vector<double> hy(size);
-  double hresult_xy = 0.0;
-  double hresult_xx = 0.0;
+    vector<double> hx(size);
+    vector<double> hy(size);
+    double hresult_xy = 0.0;
+    double hresult_xx = 0.0;

-  srand(time(NULL));
+    srand(time(NULL));

-  for (int i = 0; i < size; ++i) {
-    hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
-    hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
+    for (int i = 0; i < size; ++i) {
+      hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
+      hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;

-    hresult_xy += hx[i] * hy[i];
-    hresult_xx += hx[i] * hx[i];
-  }
+      hresult_xy += hx[i] * hy[i];
+      hresult_xx += hx[i] * hx[i];
+    }

-  double* dx;
-  double* dy;
-  double* workspace;
-  double  dresult;
+    double* dx;
+    double* dy;
+    double* workspace;
+    double dresult;

-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));

-  HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));

-  // Warm up
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-
-  // Timed run for <x,y>
-  HIP_CHECK(hipDeviceSynchronize());
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (int i = 0; i < trials; ++i) {
+    // Warm up
+    computeDotProduct(size, dx, dy, dresult, workspace);
+    computeDotProduct(size, dx, dy, dresult, workspace);
    computeDotProduct(size, dx, dy, dresult, workspace);
-  }

-  float time = 0;
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
+    // Timed run for <x,y>
+    HIP_CHECK(hipDeviceSynchronize());
+    auto all_start = std::chrono::steady_clock::now();

-  time /= trials;
+    for (int i = 0; i < trials; ++i) {
+      computeDotProduct(size, dx, dy, dresult, workspace);
+    }

-  double bw = sizeof(double) * size * 2.0 / 1e9;
-  double gf = 2.0 * size / 1e9;
+    float time = 0;
+    auto all_end = std::chrono::steady_clock::now();
+    std::chrono::duration<double> all_kernel_time = all_end - all_start;
+    time = all_kernel_time.count();

-  cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
+    time /= trials;

-  // Verify the device kernel results comparing it with the host results
-  REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));
+    double bw = sizeof(double) * size * 2.0 / 1e9;
+    double gf = 2.0 * size / 1e9;

-  // Warm up
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
+    CONSOLE_PRINT("\nVector Size: %d\n[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", size,
+                  time, bw / (time / 1e3), gf / (time / 1e3));

-  // Timed run for <x,x>
-  HIP_CHECK(hipDeviceSynchronize());
-  all_start = std::chrono::steady_clock::now();
+    // Verify the device kernel results comparing it with the host results
+    REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));

-  for (int i = 0; i < trials; ++i) {
+    // Warm up
+    computeDotProduct(size, dx, dx, dresult, workspace);
+    computeDotProduct(size, dx, dx, dresult, workspace);
    computeDotProduct(size, dx, dx, dresult, workspace);
-  }

-  all_end = std::chrono::steady_clock::now();
-  all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
+    // Timed run for <x,x>
+    HIP_CHECK(hipDeviceSynchronize());
+    all_start = std::chrono::steady_clock::now();

-  time /= trials;
-  bw = sizeof(double) * size / 1e9;
+    for (int i = 0; i < trials; ++i) {
+      computeDotProduct(size, dx, dx, dresult, workspace);
+    }

-  cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
+    all_end = std::chrono::steady_clock::now();
+    all_kernel_time = all_end - all_start;
+    time = all_kernel_time.count();

-  // Verify the device kernel results comparing it with the host results
-  REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
+    time /= trials;
+    bw = sizeof(double) * size / 1e9;

-  HIP_CHECK(hipFree(dx));
-  HIP_CHECK(hipFree(dy));
-  HIP_CHECK(hipFree(workspace));
+    CONSOLE_PRINT("[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", time, bw / (time / 1e3),
+                  gf / (time / 1e3));
+
+    // Verify the device kernel results comparing it with the host results
+    REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
+
+    HIP_CHECK(hipFree(dx));
+    HIP_CHECK(hipFree(dy));
+    HIP_CHECK(hipFree(workspace));
  }
 }

 /**
-* End doxygen group perfComputeTest.
-* @}
-*/
+ * End doxygen group perfComputeTest.
+ * @}
+ */
@@ -18,10 +18,10 @@
 */

 /**
-* @addtogroup hipPerfMandelbrot hipPerfMandelbrot
-* @{
-* @ingroup perfComputeTest
-*/
+ * @addtogroup hipPerfMandelbrot hipPerfMandelbrot
+ * @{
+ * @ingroup perfComputeTest
+ */

 #include <hip_test_common.hh>
 #include <hip/hip_vector_types.h>
@@ -45,36 +45,35 @@ coordRec coords[] = {
 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);

 template <typename T>
-__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos,
-                                   T xStep, T yStep, uint maxIter) {
+__global__ void float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                 uint maxIter) {
  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
  int i = tid % width;
  int j = tid / width;
-  float x0 = static_cast<float>(xPos + xStep*i);
-  float y0 = static_cast<float>(yPos + yStep*j);
+  float x0 = static_cast<float>(xPos + xStep * i);
+  float y0 = static_cast<float>(yPos + yStep * j);

  float x = x0;
  float y = y0;

  uint iter = 0;
  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
    tmp = x;
    x = fma(-y, y, fma(x, x, x0));
-    y = fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * tmp, y, y0);
  }
  out[tid] = iter;
 }

 template <typename T>
-__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
-    T yPos, T xStep, T yStep, uint maxIter) {
-
+__global__ void float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                           uint maxIter) {
  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
  int i = tid % width;
  int j = tid / width;
-  float x0 = static_cast<float>(xPos + xStep*static_cast<float>(i));
-  float y0 = static_cast<float>(yPos + yStep*static_cast<float>(j));
+  float x0 = static_cast<float>(xPos + xStep * static_cast<float>(i));
+  float y0 = static_cast<float>(yPos + yStep * static_cast<float>(j));

  float x = x0;
  float y = y0;
@@ -84,72 +83,71 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
  float tmp;
  int stay;
  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
+  stay = (x * x + y * y) <= 4.0;
  float savx = x;
  float savy = y;
 #ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16) {
+  for (iter = 0; (iter < maxIter); iter += 16) {
 #else
-  for (iter = 0; stay && (iter < maxIter); iter+=16) {
+  for (iter = 0; stay && (iter < maxIter); iter += 16) {
 #endif
    x = savx;
    y = savy;

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

-    stay = (x*x+y*y) <= 4.0;
+    stay = (x * x + y * y) <= 4.0;
    savx = (stay ? x : savx);
    savy = (stay ? y : savy);
-    ccount += stay*16;
+    ccount += stay * 16;
 #ifdef FAST
-    if (!stay)
-      break;
+    if (!stay) break;
 #endif
  }
  // Handle remainder
@@ -158,10 +156,10 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
    do {
      x = savx;
      y = savy;
-      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+      stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
      tmp = x;
-      x =  fma(-y, y, fma(x, x, x0));
-      y =  fma(2.0f*tmp, y, y0);
+      x = fma(-y, y, fma(x, x, x0));
+      y = fma(2.0f * tmp, y, y0);
      ccount += stay;
      iter--;
      savx = (stay ? x : savx);
@@ -172,36 +170,36 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
 }

 template <typename T>
-__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                   uint maxIter) {
+__global__ void double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                  uint maxIter) {
  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
  int i = tid % width;
  int j = tid / width;
-  double x0 = static_cast<double>(xPos + xStep*i);
-  double y0 = static_cast<double>(yPos + yStep*j);
+  double x0 = static_cast<double>(xPos + xStep * i);
+  double y0 = static_cast<double>(yPos + yStep * j);

  double x = x0;
  double y = y0;

  uint iter = 0;
  double tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
    tmp = x;
-    x = fma(-y, y,fma(x, x, x0));
-    y = fma(2.0f*tmp, y, y0);
+    x = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * tmp, y, y0);
  }
  out[tid] = iter;
 };

 template <typename T>
-__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
-                  T yPos, T xStep, T yStep, uint maxIter) {
+__global__ void double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                            uint maxIter) {
  int tid = (blockIdx.x * blockDim.x + threadIdx.x);

  int i = tid % width;
  int j = tid / width;
-  double x0 = static_cast<double>(xPos + xStep*static_cast<double>(i));
-  double y0 = static_cast<double>(yPos + yStep*static_cast<double>(j));
+  double x0 = static_cast<double>(xPos + xStep * static_cast<double>(i));
+  double y0 = static_cast<double>(yPos + yStep * static_cast<double>(j));

  double x = x0;
  double y = y0;
@@ -211,13 +209,13 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
  double tmp;
  int stay;
  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
+  stay = (x * x + y * y) <= 4.0;
  double savx = x;
  double savy = y;
 #ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16)
+  for (iter = 0; (iter < maxIter); iter += 16)
 #else
-  for (iter = 0; stay && (iter < maxIter); iter+=16)
+  for (iter = 0; stay && (iter < maxIter); iter += 16)
 #endif
  {
    x = savx;
@@ -225,141 +223,131 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,

    // Two iterations
    tmp = fma(-y, y, fma(x, x, x0));
-    y =   fma(2.0f*x, y, y0);
-    x =   fma(-y, y, fma(tmp, tmp, x0));
-    y =   fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
    tmp = fma(-y, y, fma(x, x, x0));
-    y =   fma(2.0f*x, y, y0);
-    x =   fma(-y, y, fma(tmp, tmp, x0));
-    y =   fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
    tmp = fma(-y, y, fma(x, x, x0));
-    y =   fma(2.0f*x, y, y0);
-    x =   fma(-y, y, fma(tmp, tmp, x0));
-    y =   fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

    // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);

-    stay = (x*x+y*y) <= 4.0;
+    stay = (x * x + y * y) <= 4.0;
    savx = (stay ? x : savx);
    savy = (stay ? y : savy);
-    ccount += stay*16;
+    ccount += stay * 16;
 #ifdef FAST
-    if (!stay)
-      break;
+    if (!stay) break;
 #endif
-    }
+  }
  // Handle remainder
-    if (!stay) {
-      iter = 16;
-      do {
-        x = savx;
-        y = savy;
-        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-        tmp = x;
-        x =  fma(-y,y, fma(x, x, x0));
-        y =  fma(2.0f*tmp,y,y0);
-        ccount += stay;
-        iter--;
-        savx = (stay ? x : savx);
-        savy = (stay ? y : savy);
-      }
-      while (stay && iter);
-    }
-    out[tid] = (uint)ccount;
+  if (!stay) {
+    iter = 16;
+    do {
+      x = savx;
+      y = savy;
+      stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
+      tmp = x;
+      x = fma(-y, y, fma(x, x, x0));
+      y = fma(2.0f * tmp, y, y0);
+      ccount += stay;
+      iter--;
+      savx = (stay ? x : savx);
+      savy = (stay ? y : savy);
+    } while (stay && iter);
+  }
+  out[tid] = (uint)ccount;
 };

 // Expected results for each kernel run at each coord
 unsigned long long expectedIters[] = {
-    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
-    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
-    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
-    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
+    203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull,
+    203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull,
+    203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull,
+    203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull,
+    203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};

 class hipPerfMandelBrot {
 public:
  hipPerfMandelBrot();
  ~hipPerfMandelBrot();

-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
+  void setNumKernels(unsigned int num) { numKernels = num; }

-  unsigned int getNumKernels() {
-    return numKernels;
-  }
+  unsigned int getNumKernels() { return numKernels; }

-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
+  void setNumStreams(unsigned int num) { numStreams = num; }
+  unsigned int getNumStreams() { return numStreams; }

  void open(int deviceID);
  bool run(unsigned int testCase);
  void printResults(void);

  // array of funtion pointers
-  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
-                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
-                 int threads_per_block, int kernelCnt);
+  typedef void (hipPerfMandelBrot::*funPtr)(uint* out, uint width, float xPos, float yPos,
+                                            float xStep, float yStep, uint maxIter,
+                                            hipStream_t* streams, int blocks, int threads_per_block,
+                                            int kernelCnt);

  // Wrappers
-  void float_mad(uint *out, uint width, float xPos,  float yPos,
-                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                  int blocks, int threads_per_block, int kernelCnt);
+  void float_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                 uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                 int kernelCnt);

-  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                            int blocks, int threads_per_block, int kernelCnt);
+  void float_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                           uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                           int kernelCnt);

-  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                   int threads_per_block, int kernelCnt);
+  void double_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                  uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                  int kernelCnt);

-  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
-                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                             int threads_per_block, int kernelCnt);
+  void double_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                            uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                            int kernelCnt);

  hipStream_t streams[2];

 private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);

  unsigned int numKernels;
  unsigned int numStreams;
@@ -387,9 +375,9 @@ void hipPerfMandelBrot::open(int deviceId) {
  HIP_CHECK(hipSetDevice(deviceId));
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
+
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);

  numCUs = props.multiProcessorCount;
 }
@@ -397,52 +385,52 @@ void hipPerfMandelBrot::open(int deviceId) {
 void hipPerfMandelBrot::printResults() {
  int numStreams = getNumStreams();

-  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
-            << numStreams << " streams (s)" <<  std::endl;
+  CONSOLE_PRINT("Measured perf for kernels in GFLOPS on %d streams (s)", numStreams);

-  std::map<std::string, std::vector<double>>:: iterator itr;
+  std::map<std::string, std::vector<double>>::iterator itr;
  for (itr = results.begin(); itr != results.end(); itr++) {
-          std::cout << "\n" << std::setw(20) << itr->first << " ";
-          for (auto i : results[itr->first]) {
-            std::cout << std::setw(10) << i << " ";
-            }
-     }
+    CONSOLE_PRINT("\n%s ", itr->first.c_str());
+    for (auto i : results[itr->first]) {
+      CONSOLE_PRINT("%10f ", i);
+    }
+  }
  results.clear();
-  std::cout << std::endl;
+  CONSOLE_PRINT("\n");
 }

 // Wrappers for the kernel launches
-void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                                   float yStep, uint maxIter, hipStream_t* streams,
-                                   int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::float_mad(uint* out, uint width, float xPos, float yPos, float xStep,
+                                  float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                                  int threads_per_block, int kernelCnt) {
  int streamCnt = getNumStreams();
  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                      streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep,
-                      maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }

-void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                             int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::float_mandel_unroll(uint* out, uint width, float xPos, float yPos,
+                                            float xStep, float yStep, uint maxIter,
+                                            hipStream_t* streams, int blocks, int threads_per_block,
+                                            int kernelCnt) {
  int streamCnt = getNumStreams();
  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }

-void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
-                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                               int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::double_mad(uint* out, uint width, float xPos, float yPos, float xStep,
+                                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                                   int threads_per_block, int kernelCnt) {
  int streamCnt = getNumStreams();
  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }

-void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                              int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::double_mandel_unroll(uint* out, uint width, float xPos, float yPos,
+                                             float xStep, float yStep, uint maxIter,
+                                             hipStream_t* streams, int blocks,
+                                             int threads_per_block, int kernelCnt) {
  int streamCnt = getNumStreams();
  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }

 bool hipPerfMandelBrot::run(unsigned int testCase) {
@@ -450,18 +438,18 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
  coordIdx = testCase % numCoords;

  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
-               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
+                &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};

  // Maximum iteration count
  maxIter = 32768;

-  uint ** hPtr = new uint *[numKernels];
-  uint ** dPtr = new uint *[numKernels];
+  uint** hPtr = new uint*[numKernels];
+  uint** dPtr = new uint*[numKernels];

  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
  width_ = 256;

-  bufSize = width_  * width_ * sizeof(uint);
+  bufSize = width_ * width_ * sizeof(uint);

  // Create streams for concurrency
  for (uint i = 0; i < numStreams; i++) {
@@ -470,15 +458,15 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {

  // Allocate memory on the host and device
  for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]), bufSize, hipHostMallocDefault));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
    setData(hPtr[i], 0xdeadbeef);
-    HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
+    HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
  }

  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
+  int threads = (bufSize / sizeof(uint));
+  int threads_per_block = 64;
+  int blocks = (threads / threads_per_block) + (threads % threads_per_block);

  // Copy memory asynchronously and concurrently from host to device
  for (uint i = 0; i < numKernels; i++) {
@@ -489,90 +477,88 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
  HIP_CHECK(hipStreamSynchronize(0));

  int kernelIdx;
-  if(testCase == 0 || testCase == 5 || testCase == 10) {
+  if (testCase == 0 || testCase == 5 || testCase == 10) {
    kernelIdx = 0;
-  } else if(testCase == 1 || testCase == 6 || testCase == 11) {
+  } else if (testCase == 1 || testCase == 6 || testCase == 11) {
    kernelIdx = 1;
-  } else if(testCase == 2 || testCase == 7 || testCase == 12) {
+  } else if (testCase == 2 || testCase == 7 || testCase == 12) {
    kernelIdx = 2;
-  } else if(testCase == 3 || testCase == 8 || testCase == 13){
+  } else if (testCase == 3 || testCase == 8 || testCase == 13) {
    kernelIdx = 3;
  }
  double totalTime = 0.0;
  for (unsigned int k = 0; k < numLoops; k++) {
-  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
-                  testCase == 5 || testCase == 6 || testCase == 7 ||
-                  testCase == 10 || testCase == 11 || testCase == 12)) {
-  float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
-  float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
-  float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+    if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 ||
+         testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) {
+      float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
+      float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
+      float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+      float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);

-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
+      // Time the kernel execution
+      auto all_start = std::chrono::steady_clock::now();

-  for (uint i = 0; i < numKernels; i++) {
-    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                           threads_per_block, i);
-  }
+      for (uint i = 0; i < numKernels; i++) {
+        (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                              threads_per_block, i);
+      }

-  // Synchronize all the concurrent streams to have completed execution
-  HIP_CHECK(hipStreamSynchronize(0));
+      // Synchronize all the concurrent streams to have completed execution
+      HIP_CHECK(hipStreamSynchronize(0));

-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  } else {
-  double xStep = coords[coordIdx].width / static_cast<double>(width_);
-  double yStep = -coords[coordIdx].width / static_cast<double>(width_);
-  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
-  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
+      auto all_end = std::chrono::steady_clock::now();
+      std::chrono::duration<double> all_kernel_time = all_end - all_start;
+      totalTime += all_kernel_time.count();
+    } else {
+      double xStep = coords[coordIdx].width / static_cast<double>(width_);
+      double yStep = -coords[coordIdx].width / static_cast<double>(width_);
+      double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
+      double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;

-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-  for (uint i = 0; i < numKernels; i++) {
-  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                         threads_per_block, i);
-  }
-  // Synchronize all the concurrent streams to have completed execution
-  HIP_CHECK(hipStreamSynchronize(0));
+      // Time the kernel execution
+      auto all_start = std::chrono::steady_clock::now();
+      for (uint i = 0; i < numKernels; i++) {
+        (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                              threads_per_block, i);
+      }
+      // Synchronize all the concurrent streams to have completed execution
+      HIP_CHECK(hipStreamSynchronize(0));

-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  }
+      auto all_end = std::chrono::steady_clock::now();
+      std::chrono::duration<double> all_kernel_time = all_end - all_start;
+      totalTime += all_kernel_time.count();
+    }
  }

  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
-  }
-  for(uint i = 0; i < numKernels; i++) {
-  checkData(hPtr[i]);
-  int j =0;
-  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
-          j++;
+  for (uint i = 0; i < numKernels; i++) {
+    HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
  }
+  for (uint i = 0; i < numKernels; i++) {
+    checkData(hPtr[i]);
+    int j = 0;
+    while ((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
+      j++;
+    }

-  if(j==30) {
-    std::cout << "Incorrect iteration count detected. ";
-  }
+    if (j == 30) {
+      CONSOLE_PRINT("Incorrect iteration count detected. ");
+    }
  }

  // Compute GFLOPS.  There are 7 FLOPs per iteration
-  double perf = (static_cast<double>(totalIters*numKernels) * 7 * static_cast<double>(1e-09)) /
-                (totalTime / (double)numLoops);
+  double perf = (static_cast<double>(totalIters * numKernels) * 7 * static_cast<double>(1e-09)) /
+      (totalTime / (double)numLoops);


-  std::vector<std::string> kernelName = {"float", "float_unroll",
-                      "double", "double_unroll"};
+  std::vector<std::string> kernelName = {"float", "float_unroll", "double", "double_unroll"};

  // Print results except for Warm-up kernel
  if (testCase != 100) {
-  results[kernelName[testCase % 4]].push_back(perf);
- }
+    results[kernelName[testCase % 4]].push_back(perf);
+  }

-  for(uint i = 0 ; i < numStreams; i++) {
+  for (uint i = 0; i < numStreams; i++) {
    HIP_CHECK(hipStreamDestroy(streams[i]));
  }

@@ -581,19 +567,19 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
    HIP_CHECK(hipHostFree(hPtr[i]));
    HIP_CHECK(hipFree(dPtr[i]));
  }
-  delete [] hPtr;
-  delete [] dPtr;
+  delete[] hPtr;
+  delete[] dPtr;
  return true;
 }

-void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
+void hipPerfMandelBrot::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
  for (unsigned int i = 0; i < width_ * width_; i++) {
-      ptr2[i] = value;
+    ptr2[i] = value;
  }
 }

-void hipPerfMandelBrot::checkData(uint *ptr) {
+void hipPerfMandelBrot::checkData(uint* ptr) {
  totalIters = 0;
  for (unsigned int i = 0; i < width_ * width_; i++) {
    totalIters += ptr[i];
@@ -601,30 +587,30 @@ void hipPerfMandelBrot::checkData(uint *ptr) {
 }

 /**
-* Test Description
-* ------------------------
-*  - Verify the warm-up kernel default stream executes serially.
-*  - verify by running all kernels - sync.
-*  - verify by running all kernels - async.
-* Test source
-* ------------------------
-*  - perftests/compute/hipPerfMandelbrot.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the warm-up kernel default stream executes serially.
+ *  - verify by running all kernels - sync.
+ *  - verify by running all kernels - async.
+ * Test source
+ * ------------------------
+ *  - perftests/compute/hipPerfMandelbrot.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */

 TEST_CASE("Perf_hipPerfMandelbrot") {
  hipPerfMandelBrot mandelbrotCompute;
  int deviceId = 0;
  mandelbrotCompute.open(deviceId);
-  #if HT_AMD
+#if HT_AMD
  SECTION("warm-up kernel default stream executes serially") {
    mandelbrotCompute.setNumStreams(1);
    mandelbrotCompute.setNumKernels(1);
-    REQUIRE(true == mandelbrotCompute.run(100/*Random number*/));
+    REQUIRE(true == mandelbrotCompute.run(100 /*Random number*/));
  }
-  #endif
+#endif
  SECTION("run all - sync") {
    int i = 0;
    do {
@@ -632,7 +618,7 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
      mandelbrotCompute.setNumKernels(1);
      REQUIRE(true == mandelbrotCompute.run(i));
      i++;
-    }while(i < 12);
+    } while (i < 12);
    mandelbrotCompute.printResults();
  }

@@ -643,12 +629,12 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
      mandelbrotCompute.setNumKernels(2);
      REQUIRE(true == mandelbrotCompute.run(i));
      i++;
-    }while(i < 12);
+    } while (i < 12);
    mandelbrotCompute.printResults();
  }
 }

 /**
-* End doxygen group perfComputeTest.
-* @}
-*/
+ * End doxygen group perfComputeTest.
+ * @}
+ */
@@ -18,12 +18,12 @@
 */

 /**
-* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
-* @{
-* @ingroup perfStreamTest
-* `hipError_t hipStreamCreate(hipStream_t* stream)` -
-* Create an asynchronous stream.
-*/
+ * @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
+ * @{
+ * @ingroup perfStreamTest
+ * `hipError_t hipStreamCreate(hipStream_t* stream)` -
+ * Create an asynchronous stream.
+ */

 #include <hip_test_common.hh>

@@ -34,28 +34,28 @@ typedef struct {
 } coordRec;

 static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
+    {0.0, 0.0, 0.00001},  // All black
 };

 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);

-__global__ void mandelbrot(uint *out, uint width, float xPos,
-                          float yPos, float xStep, float yStep, uint maxIter) {
+__global__ void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                           uint maxIter) {
  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
  int i = tid % width;
  int j = tid / width;
-  float x0 = static_cast<float>(xPos + xStep*i);
-  float y0 = static_cast<float>(yPos + yStep*j);
+  float x0 = static_cast<float>(xPos + xStep * i);
+  float y0 = static_cast<float>(yPos + yStep * j);

  float x = x0;
  float y = y0;

  uint iter = 0;
  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
    tmp = x;
    x = fma(-y, y, fma(x, x, x0));
-    y = fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * tmp, y, y0);
  }
  out[tid] = iter;
 };
@@ -65,20 +65,16 @@ class hipPerfDeviceConcurrency {
  hipPerfDeviceConcurrency();
  ~hipPerfDeviceConcurrency();

-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
+  void setNumGpus(unsigned int num) { numDevices = num; }
+  unsigned int getNumGpus() { return numDevices; }

  void open(void);
  void close(void);
  bool run(unsigned int testCase, int numGpus);

 private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);

  unsigned int numDevices;
  unsigned int width_;
@@ -100,17 +96,16 @@ void hipPerfDeviceConcurrency::open(void) {
  }
 }

-void hipPerfDeviceConcurrency::close() {
-}
+void hipPerfDeviceConcurrency::close() {}

 bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
  static int deviceId;
-  uint ** hPtr = new uint*[numGpus];
-  uint ** dPtr = new uint*[numGpus];
-  hipStream_t * streams = new hipStream_t[numGpus];
-  int *numCUs = new int[numGpus];
-  unsigned int *maxIter = new unsigned int[numGpus];
-  unsigned long long *expectedIters = new unsigned long long[numGpus];
+  uint** hPtr = new uint*[numGpus];
+  uint** dPtr = new uint*[numGpus];
+  hipStream_t* streams = new hipStream_t[numGpus];
+  int* numCUs = new int[numGpus];
+  unsigned int* maxIter = new unsigned int[numGpus];
+  unsigned long long* expectedIters = new unsigned long long[numGpus];

  int threads, threads_per_block, blocks;
  float xStep, yStep, xPos, yPos;
@@ -124,25 +119,21 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
    hipDeviceProp_t props;
    HIP_CHECK(hipGetDeviceProperties(&props, i));
    if (testCase != 0) {
-    std::cout << "info: running on bus " << "0x" << props.pciBusID
-              << " " << props.name << " with " << props.multiProcessorCount
-              << " CUs" << " and device ID: " << i << std::endl;
+      CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
+                    props.name, props.multiProcessorCount, i);
    }
-
    numCUs[i] = props.multiProcessorCount;
    int clkFrequency = 0;
-    HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
-                                    hipDeviceAttributeClockRate, i));
+    HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
    if (clkFrequency == 0) {
-      std::cout << "clkFrequency = 0, set it to 1000000\n";
+      CONSOLE_PRINT("clkFrequency = 0, set it to 1000000");
      clkFrequency = 1000000;
    }
-    clkFrequency =(unsigned int)clkFrequency/1000;
+    clkFrequency = (unsigned int)clkFrequency / 1000;

    // Maximum iteration count
    // maxIter = 8388608 * (engine_clock / 1000).serial execution
-    maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000))
-                                              * numCUs[i]) / 128);
+    maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
    maxIter[i] = (maxIter[i] + 15) & ~15;

    // Width is divisible by 4 because the mandelbrot
@@ -153,15 +144,14 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
    HIP_CHECK(hipStreamCreate(&streams[i]));

    // Allocate memory on the host and device
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
-                            bufSize, hipHostMallocDefault));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
    setData(hPtr[i], 0xdeadbeef);
-    HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
+    HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))

    // Prepare kernel launch parameters
-    threads = (bufSize/sizeof(uint));
-    threads_per_block  = 64;
-    blocks = (threads/threads_per_block) + (threads % threads_per_block);
+    threads = (bufSize / sizeof(uint));
+    threads_per_block = 64;
+    blocks = (threads / threads_per_block) + (threads % threads_per_block);

    coordIdx = testCase % numCoords;
    xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
@@ -180,10 +170,9 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
      deviceId = i;
    }

-  HIP_CHECK(hipSetDevice(deviceId));
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
-                    streams[i], dPtr[i], width_, xPos, yPos, xStep,
-                    yStep, maxIter[i]);
+    HIP_CHECK(hipSetDevice(deviceId));
+    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], dPtr[i],
+                       width_, xPos, yPos, xStep, yStep, maxIter[i]);
  }
  for (int i = 0; i < numGpus; i++) {
    HIP_CHECK(hipStreamSynchronize(0));
@@ -192,8 +181,8 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
  auto all_end = std::chrono::steady_clock::now();
  std::chrono::duration<double> all_kernel_time = all_end - all_start;

-  for(int i = 0; i < numGpus; i++) {
-    if(testCase != 0) {
+  for (int i = 0; i < numGpus; i++) {
+    if (testCase != 0) {
      deviceId = i;
    }
    HIP_CHECK(hipSetDevice(deviceId));
@@ -201,11 +190,11 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
    // Copy data back from device to the host
    HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
    checkData(hPtr[i]);
-    expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
+    expectedIters[i] = width_ * width_ * (unsigned long long)maxIter[i];
    if (testCase != 0) {
      checkData(hPtr[i]);
      if (totalIters != expectedIters[i]) {
-        std::cout << "Incorrect iteration count detected" << std::endl;
+        CONSOLE_PRINT("Incorrect iteration count detected");
      }
    }

@@ -216,31 +205,30 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
  }

  if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus
-            << " device (s): " << all_kernel_time.count() << " (s) "
-            << '\n' << std::endl;
+    CONSOLE_PRINT("\nMeasured time for kernel computation on %d device(s): %.6f (s)\n", numGpus,
+                  all_kernel_time.count());
  }

  if (testCase == 0) {
    deviceId++;
  }
-  delete [] hPtr;
-  delete [] dPtr;
-  delete [] streams;
-  delete [] numCUs;
-  delete [] maxIter;
-  delete [] expectedIters;
+  delete[] hPtr;
+  delete[] dPtr;
+  delete[] streams;
+  delete[] numCUs;
+  delete[] maxIter;
+  delete[] expectedIters;
  return true;
 }

-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
+void hipPerfDeviceConcurrency::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    ptr2[i] = value;
  }
 }

-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
+void hipPerfDeviceConcurrency::checkData(uint* ptr) {
  totalIters = 0;
  for (unsigned int i = 0; i < width_ * width_; i++) {
    totalIters += ptr[i];
@@ -248,16 +236,16 @@ void hipPerfDeviceConcurrency::checkData(uint *ptr) {
 }

 /**
-* Test Description
-* ------------------------
-*  - Verify the different levels of device concurrency.
-* Test source
-* ------------------------
-*  - perftests/stream/hipPerfDeviceConcurrency.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the different levels of device concurrency.
+ * Test source
+ * ------------------------
+ *  - perftests/stream/hipPerfDeviceConcurrency.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */

 TEST_CASE("Perf_hipPerfDeviceConcurrency") {
  hipPerfDeviceConcurrency deviceConcurrency;
@@ -279,6 +267,6 @@ TEST_CASE("Perf_hipPerfDeviceConcurrency") {
 }

 /**
-* End doxygen group perfStreamTest.
-* @}
-*/
+ * End doxygen group perfStreamTest.
+ * @}
+ */
@@ -18,12 +18,12 @@
 */

 /**
-* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
-* @{
-* @ingroup perfComputeTest
-* `hipError_t hipStreamCreate(hipStream_t* stream)` -
-* Create an asynchronous stream.
-*/
+ * @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
+ * @{
+ * @ingroup perfComputeTest
+ * `hipError_t hipStreamCreate(hipStream_t* stream)` -
+ * Create an asynchronous stream.
+ */

 #include <hip_test_common.hh>
 #include <hip/hip_vector_types.h>
@@ -55,23 +55,23 @@ static coordRec coords[] = {

 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);

-__global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
-         float xStep, float yStep, uint maxIter) {
+__global__ static void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep,
+                                  float yStep, uint maxIter) {
  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % (width/4);
-  int j = tid / (width/4);
-  int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
+  int i = tid % (width / 4);
+  int j = tid / (width / 4);
+  int4 veci = make_int4(4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3);
  int4 vecj = make_int4(j, j, j, j);
  float4 x0;
-  x0.x = static_cast<float>(xPos + xStep*veci.x);
-  x0.y = static_cast<float>(xPos + xStep*veci.y);
-  x0.z = static_cast<float>(xPos + xStep*veci.z);
-  x0.w = static_cast<float>(xPos + xStep*veci.w);
+  x0.x = static_cast<float>(xPos + xStep * veci.x);
+  x0.y = static_cast<float>(xPos + xStep * veci.y);
+  x0.z = static_cast<float>(xPos + xStep * veci.z);
+  x0.w = static_cast<float>(xPos + xStep * veci.w);
  float4 y0;
-  y0.x = static_cast<float>(yPos + yStep*vecj.x);
-  y0.y = static_cast<float>(yPos + yStep*vecj.y);
-  y0.z = static_cast<float>(yPos + yStep*vecj.z);
-  y0.w = static_cast<float>(yPos + yStep*vecj.w);
+  y0.x = static_cast<float>(yPos + yStep * vecj.x);
+  y0.y = static_cast<float>(yPos + yStep * vecj.y);
+  y0.z = static_cast<float>(yPos + yStep * vecj.z);
+  y0.w = static_cast<float>(yPos + yStep * vecj.w);
  float4 x = x0;
  float4 y = y0;
  uint iter = 0;
@@ -80,53 +80,52 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
  int4 ccount = make_int4(0, 0, 0, 0);
  float4 savx = x;
  float4 savy = y;
-  stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
-  stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
-  stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
-  stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
-  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
-  iter+=16) {
+  stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
+  stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
+  stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
+  stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
+  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter += 16) {
    x = savx;
    y = savy;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
    // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
    y = 2.0f * tmp * y + y0;
-    stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
-    stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
-    stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
-    stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
+    stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
+    stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
+    stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
+    stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
    savx.x = static_cast<bool>(stay.x ? x.x : savx.x);
    savx.y = static_cast<bool>(stay.y ? x.y : savx.y);
    savx.z = static_cast<bool>(stay.z ? x.z : savx.z);
@@ -135,10 +134,10 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
    savy.y = static_cast<bool>(stay.y ? y.y : savy.y);
    savy.z = static_cast<bool>(stay.z ? y.z : savy.z);
    savy.w = static_cast<bool>(stay.w ? y.w : savy.w);
-    ccount.x -= stay.x*16;
-    ccount.y -= stay.y*16;
-    ccount.z -= stay.z*16;
-    ccount.w -= stay.w*16;
+    ccount.x -= stay.x * 16;
+    ccount.y -= stay.y * 16;
+    ccount.z -= stay.z * 16;
+    ccount.w -= stay.w * 16;
  }
  // Handle remainder
  if (!(stay.x & stay.y & stay.z & stay.w)) {
@@ -146,13 +145,13 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
    do {
      x = savx;
      y = savy;
-      stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x <  maxIter);
-      stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y <  maxIter);
-      stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z <  maxIter);
-      stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w <  maxIter);
+      stay.x = ((x.x * x.x + y.x * y.x) <= 4.0f) && (ccount.x < maxIter);
+      stay.y = ((x.y * x.y + y.y * y.y) <= 4.0f) && (ccount.y < maxIter);
+      stay.z = ((x.z * x.z + y.z * y.z) <= 4.0f) && (ccount.z < maxIter);
+      stay.w = ((x.w * x.w + y.w * y.w) <= 4.0f) && (ccount.w < maxIter);
      tmp = x;
-      x = x*x + x0 - y*y;
-      y = 2.0f*tmp*y + y0;
+      x = x * x + x0 - y * y;
+      y = 2.0f * tmp * y + y0;
      ccount.x += stay.x;
      ccount.y += stay.y;
      ccount.z += stay.z;
@@ -168,7 +167,7 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
      savy.w = (stay.w ? y.w : savy.w);
    } while ((stay.x | stay.y | stay.z | stay.w) && iter);
  }
-  uint4 *vecOut = reinterpret_cast<uint4 *>(out);
+  uint4* vecOut = reinterpret_cast<uint4*>(out);
  vecOut[tid].x = (uint)(ccount.x);
  vecOut[tid].y = (uint)(ccount.y);
  vecOut[tid].z = (uint)(ccount.z);
@@ -180,27 +179,19 @@ class hipPerfStreamConcurrency {
  hipPerfStreamConcurrency();
  ~hipPerfStreamConcurrency();

-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
+  void setNumKernels(unsigned int num) { numKernels = num; }
+  void setNumStreams(unsigned int num) { numStreams = num; }
+  unsigned int getNumStreams() { return numStreams; }

-  unsigned int getNumKernels() {
-    return numKernels;
-  }
+  unsigned int getNumKernels() { return numKernels; }

  bool open(int deviceID);
  bool run(unsigned int testCase, unsigned int deviceId);
  void close(void);

 private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);

  unsigned int numKernels;
  unsigned int numStreams;
@@ -227,38 +218,34 @@ bool hipPerfStreamConcurrency::open(int deviceId) {
  HIP_CHECK(hipSetDevice(deviceId));
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID
-    << " " << props.name << " with " << props.multiProcessorCount << " CUs"
-     << " and device id: " << deviceId  << std::endl;
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);
+
  numCUs = props.multiProcessorCount;
  return true;
 }

-void hipPerfStreamConcurrency::close() {
-}
+void hipPerfStreamConcurrency::close() {}

-bool hipPerfStreamConcurrency::run(unsigned int testCase,
-                               unsigned int deviceId) {
+bool hipPerfStreamConcurrency::run(unsigned int testCase, unsigned int deviceId) {
  int clkFrequency = 0;
  unsigned int numStreams = getNumStreams();
  unsigned int numKernels = getNumKernels();

-  HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
-             hipDeviceAttributeClockRate, deviceId));
+  HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
  if (clkFrequency == 0) {
-    std::cout << "clkFrequency = 0, set it to 1000000\n";
+    CONSOLE_PRINT("clkFrequency = 0, set it to 1000000\n");
    clkFrequency = 1000000;
  }
-  clkFrequency =(unsigned int)clkFrequency/1000;
+  clkFrequency = (unsigned int)clkFrequency / 1000;

  // Maximum iteration count
  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000))
-                                                        * numCUs) / 128);
+  maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000)) * numCUs) / 128);
  maxIter = (maxIter + 15) & ~15;
-  hipStream_t *streams = new hipStream_t[numStreams];
-  uint ** hPtr = new uint*[numKernels];
-  uint ** dPtr = new uint*[numKernels];
+  hipStream_t* streams = new hipStream_t[numStreams];
+  uint** hPtr = new uint*[numKernels];
+  uint** dPtr = new uint*[numKernels];

  // Width is divisible by 4 because the mandelbrot kernel
  // processes 4 pixels at once.
@@ -271,16 +258,15 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,

  // Allocate memory on the host and device
  for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
-                            bufSize, hipHostMallocDefault));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
    setData(hPtr[i], 0xdeadbeef);
-    HIP_CHECK(hipMalloc(reinterpret_cast<void **>(&dPtr[i]), bufSize))
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dPtr[i]), bufSize))
  }

  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
+  int threads = (bufSize / sizeof(uint));
+  int threads_per_block = 64;
+  int blocks = (threads / threads_per_block) + (threads % threads_per_block);
  coordIdx = testCase % numCoords;
  float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
  float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
@@ -289,8 +275,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,

  // Copy memory asynchronously and concurrently from host to device
  for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]),
-                                 hPtr[i], bufSize, streams[i % numStreams]));
+    HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize,
+                                 streams[i % numStreams]));
  }

  // Synchronize to make sure all the copies are completed
@@ -305,9 +291,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
  auto all_start = std::chrono::steady_clock::now();

  for (uint i = 0; i < numKernels; i++) {
-    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block),
-    0, streams[i%numStreams], dPtr[i], width_, xPos, yPos, xStep,
-     yStep, maxIter);
+    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
+                       streams[i % numStreams], dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
  }

  // Synchronize all the concurrent streans to have completed execution
@@ -320,17 +305,16 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,

  // Copy data back from device to the host
  for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i],
-     reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
-      streams[i % numStreams]));
+    HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
+                                 streams[i % numStreams]));
  }

  if (testCase != 0) {
-  std::cout <<"Measured time for " << numKernels <<" kernels (s) on "
-  << numStreams <<" stream (s): " << all_kernel_time.count() << std::endl;
+    CONSOLE_PRINT("Measured time for %d kernels (s) on %d stream(s): %e\n", numKernels, numStreams,
+                  all_kernel_time.count());
  }

-  for (uint i = 0 ; i < numStreams; i++) {
+  for (uint i = 0; i < numStreams; i++) {
    HIP_CHECK(hipStreamDestroy(streams[i]));
  }

@@ -340,20 +324,20 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
    HIP_CHECK(hipFree(dPtr[i]));
  }

-  delete [] streams;
-  delete [] hPtr;
-  delete [] dPtr;
+  delete[] streams;
+  delete[] hPtr;
+  delete[] dPtr;
  return true;
 }

-void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ ; i++) {
-      ptr2[i] = value;
+void hipPerfStreamConcurrency::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
+  for (unsigned int i = 0; i < width_; i++) {
+    ptr2[i] = value;
  }
 }

-void hipPerfStreamConcurrency::checkData(uint *ptr) {
+void hipPerfStreamConcurrency::checkData(uint* ptr) {
  totalIters = 0;
  for (unsigned int i = 0; i < width_; i++) {
    totalIters += ptr[i];
@@ -361,16 +345,16 @@ void hipPerfStreamConcurrency::checkData(uint *ptr) {
 }

 /**
-* Test Description
-* ------------------------
-*  - Verify the different levels of stream concurrency.
-* Test source
-* ------------------------
-*  - perftests/stream/hipPerfStreamConcurrency.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the different levels of stream concurrency.
+ * Test source
+ * ------------------------
+ *  - perftests/stream/hipPerfStreamConcurrency.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */

 TEST_CASE("Perf_hipPerfStreamConcurrency") {
  hipPerfStreamConcurrency streamConcurrency;
@@ -386,10 +370,10 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
        break;

      case 1:
-      // default stream executes serially
-      streamConcurrency.setNumStreams(1);
-      streamConcurrency.setNumKernels(1);
-      break;
+        // default stream executes serially
+        streamConcurrency.setNumStreams(1);
+        streamConcurrency.setNumKernels(1);
+        break;

      case 2:
        // 2-way concurrency
@@ -419,6 +403,6 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
 }

 /**
-* End doxygen group perfComputeTest.
-* @}
-*/
+ * End doxygen group perfComputeTest.
+ * @}
+ */
@@ -18,19 +18,17 @@
 */

 /**
-* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
-* @{
-* @ingroup perfStreamTest
-* `hipError_t hipStreamCreate(hipStream_t* stream)` -
-* Create an asynchronous stream.
-*/
+ * @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
+ * @{
+ * @ingroup perfStreamTest
+ * `hipError_t hipStreamCreate(hipStream_t* stream)` -
+ * Create an asynchronous stream.
+ */

 #include <hip_test_kernels.hh>
 #include <hip_test_checkers.hh>
 #include <hip_test_common.hh>

-using namespace std;
-
 #define BufSize 0x1000
 #define Iterations 0x100
 #define TotalStreams 4
@@ -39,17 +37,20 @@ using namespace std;

 class hipPerfStreamCreateCopyDestroy {
 private:
-    unsigned int numBuffers_;
-    unsigned int numStreams_;
-    const size_t totalStreams_[TotalStreams];
-    const size_t totalBuffers_[TotalBufs];
+  unsigned int numBuffers_;
+  unsigned int numStreams_;
+  const size_t totalStreams_[TotalStreams];
+  const size_t totalBuffers_[TotalBufs];
+
 public:
-    hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
-                                       totalStreams_{1, 2, 4, 8},
-                                       totalBuffers_{1, 100, 1000, 5000} {};
-    ~hipPerfStreamCreateCopyDestroy() {};
-    bool open(int deviceID);
-    bool run(unsigned int testNumber);
+  hipPerfStreamCreateCopyDestroy()
+      : numBuffers_(0),
+        numStreams_(0),
+        totalStreams_{1, 2, 4, 8},
+        totalBuffers_{1, 100, 1000, 5000} {};
+  ~hipPerfStreamCreateCopyDestroy(){};
+  bool open(int deviceID);
+  bool run(unsigned int testNumber);
 };

 bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
@@ -61,20 +62,20 @@ bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
  HIP_CHECK(hipSetDevice(deviceId));
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID
-  << " " << props.name << " with " << props.multiProcessorCount << " CUs"
-  << " and device id: " << deviceId  << std::endl;
+
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);
  return true;
 }

 bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
  numStreams_ = totalStreams_[testNumber % TotalStreams];
-  size_t iter = Iterations / (numStreams_ * (static_cast<size_t>(1)
-                 << (testNumber / TotalBufs + 1)));
-  hipStream_t *streams = new hipStream_t[numStreams_];
+  size_t iter =
+      Iterations / (numStreams_ * (static_cast<size_t>(1) << (testNumber / TotalBufs + 1)));
+  hipStream_t* streams = new hipStream_t[numStreams_];

  numBuffers_ = totalBuffers_[testNumber / TotalBufs];
-  float ** dSrc = new float*[numBuffers_];
+  float** dSrc = new float*[numBuffers_];
  size_t nBytes = BufSize * sizeof(float);

  for (size_t b = 0; b < numBuffers_; ++b) {
@@ -97,8 +98,7 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {

    for (size_t s = 0; s < numStreams_; ++s) {
      for (size_t b = 0; b < numBuffers_; ++b) {
-        HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes,
-                  hipMemcpyHostToDevice, streams[s]));
+        HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
      }
    }

@@ -112,31 +112,31 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {

  auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));

-  cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
-       << setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
-       << iter << " iterations " << time << " (ms) " << endl;
+  CONSOLE_PRINT(
+      "Create+Copy+Destroy time for %u streams and %u buffers and %zu iterations %.6f (ms)\n",
+      numStreams_, numBuffers_, iter, time);

-  delete [] hSrc;
+  delete[] hSrc;
  for (size_t b = 0; b < numBuffers_; ++b) {
    HIP_CHECK(hipFree(dSrc[b]));
  }

-  delete [] streams;
-  delete [] dSrc;
+  delete[] streams;
+  delete[] dSrc;
  return true;
 }

 /**
-* Test Description
-* ------------------------
-*  - Verify the Create+Copy+Destroy time for different stream.
-* Test source
-* ------------------------
-*  - perftests/stream/hipPerfDeviceConcurrency.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the Create+Copy+Destroy time for different stream.
+ * Test source
+ * ------------------------
+ *  - perftests/stream/hipPerfDeviceConcurrency.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */

 TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
  hipPerfStreamCreateCopyDestroy streamCCD;
@@ -149,6 +149,6 @@ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
 }

 /**
-* End doxygen group perfStreamTest.
-* @}
-*/
+ * End doxygen group perfStreamTest.
+ * @}
+ */
@@ -1,382 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <vector>
-
-#define DOT_DIM 256
-
-using namespace std;
-
-template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_not_equal(int n,
-                                 const double* __restrict__ x,
-                                 const double* __restrict__ y,
-                                 double* __restrict__ workspace) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  double sum = 0.0;
-  for(int idx = gid; idx < n; idx += hipGridDim_x * hipBlockDim_x) {
-    sum = fma(y[idx], x[idx], sum);
-  }
-
-  __shared__ double sdata[BLOCKSIZE];
-  sdata[threadIdx.x] = sum;
-
-  __syncthreads();
-
-  if(threadIdx.x < 128) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 128];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 64){
-    sdata[threadIdx.x] += sdata[threadIdx.x + 64];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 32){
-    sdata[threadIdx.x] += sdata[threadIdx.x + 32];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 16) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 16];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 8) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 8];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 4) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 2) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 1) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 1];
-  }
-
-  if(threadIdx.x == 0) {
-    workspace[blockIdx.x] = sdata[0];
-  }
-
-}
-
-template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_equal(int n, const double* __restrict__ x,
-                                  double* __restrict__ workspace) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  double sum = 0.0;
-  for(int idx = gid; idx < n; idx += hipGridDim_x * blockDim.x) {
-    sum = fma(x[idx], x[idx], sum);
-  }
-
-  __shared__ double sdata[BLOCKSIZE];
-  sdata[threadIdx.x] = sum;
-
-  __syncthreads();
-
-  if(threadIdx.x < 128) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 128];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 64) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 64];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 32) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 32];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 16) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 16];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 8) {
-          sdata[threadIdx.x] += sdata[threadIdx.x + 8];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 4) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 2) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 1) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 1];
-  }
-
-  if(threadIdx.x == 0) {
-    workspace[blockIdx.x] = sdata[0];
-    }
-}
-
-template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void dot_reduction(double* __restrict__ workspace) {
-
-  __shared__ double sdata[BLOCKSIZE];
-
-  sdata[threadIdx.x] = workspace[threadIdx.x];
-
-  __syncthreads();
-
-  if(threadIdx.x < 128) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 128];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 64) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 64];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 32) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 32];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 16) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 16];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 8) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 8];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 4) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  } __syncthreads();
-
-  if(threadIdx.x < 2) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 1) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 1];
-  }
-
-  if(threadIdx.x == 0) {
-    workspace[0] = sdata[0];
-  }
-
-}
-
-void computeDotProduct(int n, const double* x, const double* y, double& result,
-                      double* workspace)
-{
-  dim3 blocks(DOT_DIM);
-  dim3 threadsPerBlock(DOT_DIM);
-
-  if(x != y) {
-    hipLaunchKernelGGL(vectors_not_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, y,
-                       workspace);
-  }
-  else {
-    hipLaunchKernelGGL(vectors_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, workspace);
-  }
-
-  // Part 2 of dot product computation
-  hipLaunchKernelGGL(dot_reduction<DOT_DIM>, dim3(1), threadsPerBlock, 0, 0, workspace);
-
-  // Copy the final dot product result back from the device
-  HIPCHECK(hipMemcpy(&result, workspace, sizeof(double), hipMemcpyDeviceToHost));
-
-  return;
-}
-
-int main(int argc, char* argv[]) {
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-  hipDeviceProp_t props = {0};
-  props = {0};
-  HIPCHECK(hipSetDevice(p_gpuDevice));
-  HIPCHECK(hipGetDeviceProperties(&props, p_gpuDevice));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << p_gpuDevice
-    << std::endl;
-
-  int nx, ny, nz;
-
-  for (unsigned int testCase = 0; testCase < 3; testCase++) {
-
-  vector<int> vectorSize = {200, 300, 50};
-  switch(testCase) {
-
-  case 0:
-  nx = vectorSize[0];
-  ny = vectorSize[0];
-  nz = vectorSize[0];
-  break;
-
-  case 1:
-  nx = vectorSize[1];
-  ny = vectorSize[1];
-  nz = vectorSize[1];
-  break;
-
-  case 2:
-  nx = vectorSize[0];
-  ny = vectorSize[1];
-  nz = vectorSize[2];
-  break;
-
-  default:
-    break;
-
-  }
-
-  int trials = 200;
-
-  int size = nx * ny * nz;
-
-  vector<double> hx(size);
-  vector<double> hy(size);
-  double hresult_xy = 0.0;
-  double hresult_xx = 0.0;
-
-  srand(time(NULL));
-
-  for(int i = 0; i < size; ++i) {
-    hx[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
-    hy[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
-
-    hresult_xy += hx[i] * hy[i];
-    hresult_xx += hx[i] * hx[i];
-  }
-
-  double* dx;
-  double* dy;
-  double* workspace;
-  double  dresult;
-
-  HIPCHECK(hipMalloc((void**)&dx, sizeof(double) * size));
-  HIPCHECK(hipMalloc((void**)&dy, sizeof(double) * size));
-  HIPCHECK(hipMalloc((void**)&workspace, sizeof(double) * DOT_DIM));
-
-  HIPCHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
-  HIPCHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
-
-  // Warm up
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-
-  // Timed run for <x,y>
-  HIPCHECK(hipDeviceSynchronize());
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < trials; ++i) {
-    computeDotProduct(size, dx, dy, dresult, workspace);
-  }
-
-  float time = 0;
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
-
-  time /= trials;
-
-  double bw = sizeof(double) * size * 2.0 / 1e9;
-  double gf = 2.0 * size / 1e9;
-
-  cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
-
-  // Verify the device kernel results comparing it with the host results
-  if(std::abs(dresult - hresult_xy) > std::max(dresult * 1e-10, 1e-8)) {
-    cerr << " Device results inconsistent with host results. "
-         << " Host result: " << hresult_xy
-         << " Device result: " << dresult;
-  }
-
-  // Warm up
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
-
-  // Timed run for <x,x>
-  HIPCHECK(hipDeviceSynchronize());
-  all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < trials; ++i) {
-    computeDotProduct(size, dx, dx, dresult, workspace);
-  }
-
-  all_end = std::chrono::steady_clock::now();
-  all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
-
-  time /= trials;
-  bw = sizeof(double) * size / 1e9;
-
-  cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
-
-  // Verify the device kernel results comparing it with the host results
-  if(abs(dresult - hresult_xx) > max(dresult * 1e-10, 1e-8)) {
-    cerr << " Device results inconsistent with host results"
-         << " Host result: " << hresult_xy
-         << " Device result: " << dresult;
-  }
-
-  HIPCHECK(hipFree(dx));
-  HIPCHECK(hipFree(dy));
-  HIPCHECK(hipFree(workspace));
-
-  }
-  passed();
-  return 0;
-}
@@ -1,743 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-#include <hip/math_functions.h>
-#include <vector>
-#include <string>
-#include <map>
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-coordRec coords[] = {
-    {0.0, 0.0, 4.0},                                     // Whole set
-    {0.0, 0.0, 0.00001},                                 // All black
-    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-template <typename T>
-__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                  uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-template <typename T>
-__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
-    T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*(float)i);
-  float y0 = (float)(yPos + yStep*(float)j);
-
-  float x = x0;
-  float y = y0;
-
-#define FAST
-  uint iter = 0;
-  float tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  float savx = x;
-  float savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16) {
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16) {
-#endif
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-  }
-  // Handle remainder
-  if (!stay) {
-    iter = 16;
-    do {
-      x = savx;
-      y = savy;
-      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-      tmp = x;
-      x =  fma(-y,y, fma(x,x,x0));
-      y =  fma(2.0f*tmp,y,y0);
-      ccount += stay;
-      iter--;
-      savx = (stay ? x : savx);
-      savy = (stay ? y : savy);
-    } while (stay && iter);
-  }
-
-
-  out[tid] = (uint)ccount;
-
-};
-
-
-template <typename T>
-__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                   uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*i);
-  double y0 = (double)(yPos + yStep*j);
-
-  double x = x0;
-  double y = y0;
-
-  uint iter = 0;
-  double tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-  out[tid] = iter;
-};
-
-
-template <typename T>
-__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
-                  T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*(double)i);
-  double y0 = (double)(yPos + yStep*(double)j);
-
-  double x = x0;
-  double y = y0;
-
-#define FAST
-  uint iter = 0;
-  double tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  double savx = x;
-  double savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16)
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16)
-#endif
-  {
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-    }
-  // Handle remainder
-    if (!stay) {
-      iter = 16;
-      do {
-        x = savx;
-        y = savy;
-        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-        tmp = x;
-        x =  fma(-y,y, fma(x,x,x0));
-        y =  fma(2.0f*tmp,y,y0);
-        ccount += stay;
-        iter--;
-        savx = (stay ? x : savx);
-        savy = (stay ? y : savy);
-      }
-      while (stay && iter);
-
-    }
-    out[tid] = (uint)ccount;
-};
-
-static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
-
-// Expected results for each kernel run at each coord
-unsigned long long expectedIters[] = {
-    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
-    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
-    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
-    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
-
-class hipPerfMandelBrot {
-  public:
-  hipPerfMandelBrot();
-  ~hipPerfMandelBrot();
-
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
-
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
-
-  void open(int deviceID);
-  void run(unsigned int testCase, unsigned int deviceId);
-  void printResults(void);
-
-  // array of funtion pointers
-  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
-                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
-                 int threads_per_block, int kernelCnt);
-
-  // Wrappers
-  void float_mad(uint *out, uint width, float xPos,  float yPos,
-                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                  int blocks, int threads_per_block, int kernelCnt);
-
-  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                            int blocks, int threads_per_block, int kernelCnt);
-
-  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                   int threads_per_block, int kernelCnt);
-
-  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
-                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                             int threads_per_block, int kernelCnt);
-
-  hipStream_t streams[2];
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numKernels;
-  unsigned int numStreams;
-
-  std::map<std::string, std::vector<double>> results;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int maxIter;
-  unsigned int coordIdx;
-  volatile unsigned long long totalIters = 0;
-  int numCUs;
-  static const unsigned int numLoops = 10;
-};
-
-
-hipPerfMandelBrot::hipPerfMandelBrot() {}
-
-hipPerfMandelBrot::~hipPerfMandelBrot() {}
-
-void hipPerfMandelBrot::open(int deviceId) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
-
-  numCUs = props.multiProcessorCount;
-}
-
-
-void hipPerfMandelBrot::printResults() {
-
-  int numkernels = getNumKernels();
-  int numStreams = getNumStreams();
-
-  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
-            << numStreams << " streams (s)" <<  std::endl;
-
-  std::map<std::string, std::vector<double>>:: iterator itr;
-  for (itr = results.begin(); itr != results.end(); itr++) {
-          std::cout << "\n" << std::setw(20) << itr->first << " ";
-          for(auto i : results[itr->first]) {
-            std::cout << std::setw(10) << i << " ";
-            }
-     }
-  results.clear();
-
-  std::cout << std::endl;
-}
-
-
-// Wrappers for the kernel launches
-void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                                   float yStep, uint maxIter, hipStream_t* streams,
-                                   int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                      streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
-                      maxIter);
-
-
-}
-
-
-void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                             int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
-                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                               int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                              int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
-
-  unsigned int numStreams = getNumStreams();
-  coordIdx = testCase % numCoords;
-
-  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
-               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
-
-  // Maximum iteration count
-  maxIter = 32768;
-
-  uint * hPtr[numKernels];
-  uint * dPtr[numKernels];
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_  * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  for (uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-
-  // Allocate memory on the host and device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-    setData(hPtr[i], 0xdeadbeef);
-    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-  }
-
-
-  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory asynchronously and concurrently from host to device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-  }
-
-  // Synchronize to make sure all the copies are completed
-  HIPCHECK(hipStreamSynchronize(0));
-
-  int kernelIdx;
-  if(testCase == 0 || testCase == 5 || testCase == 10) {
-    kernelIdx = 0;
-  }
-
-  else if(testCase == 1 || testCase == 6 || testCase == 11) {
-    kernelIdx = 1;
-  }
-  else if(testCase == 2 || testCase == 7 || testCase == 12) {
-    kernelIdx = 2;
-  }
-  else if(testCase == 3 || testCase == 8 || testCase == 13){
-    kernelIdx = 3;
-  }
-
-
-  double totalTime = 0.0;
-
-  for (unsigned int k = 0; k < numLoops; k++) {
-  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
-                  testCase == 5 || testCase == 6 || testCase == 7 ||
-                  testCase == 10 || testCase == 11 || testCase == 12)) {
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                           threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-
-  }
-
-
-  else {
-  double xStep = coords[coordIdx].width / (double)width_;
-  double yStep = -coords[coordIdx].width / (double)width_;
-  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
-  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                         threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  }
-
-
-  }
-
-  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
-  }
-
-
-  for(uint i = 0; i < numKernels; i++) {
-  checkData(hPtr[i]);
-
-  int j =0;
-  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
-          j++;
-  }
-
-  if(j==30) {
-    std::cout << "Incorrect iteration count detected. ";
-  }
-
-  }
-
-
-  // Compute GFLOPS.  There are 7 FLOPs per iteration
-  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
-                (totalTime / (double)numLoops);
-
-
-  std::vector<std::string> kernelName = {"float", "float_unroll",
-                      "double", "double_unroll"};
-
-  // Print results except for Warm-up kernel
-  if(testCase!=100) {
-  results[kernelName[testCase % 4]].push_back(perf);
- }
-
-
-  for(uint i = 0 ; i < numStreams; i++) {
-    HIPCHECK(hipStreamDestroy(streams[i]));
-  }
-
-
-  // Free host and device memory
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostFree(hPtr[i]));
-    HIPCHECK(hipFree(dPtr[i]));
-  }
-
-
-}
-
-
-void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfMandelBrot::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfMandelBrot mandelbrotCompute;
-  int deviceId = 0;
-
-  mandelbrotCompute.open(deviceId);
-
-  for (unsigned int testCase = 0; testCase < 3; testCase++) {
-
-
-  switch (testCase) {
-
-
-  case 0: {
-    // Warmup-kernel - default stream executes serially
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(100/*Random number*/, deviceId);
-    break;
-    }
-
-
-  case 1: {
-    // run all - sync
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-  }
-
-
-  case 2: {
-    // run all - async
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(2);
-    mandelbrotCompute.setNumKernels(2);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-
-  }
-
-
-  default: {
-    break;
-  }
-
-
-  }
-
-
-
-  }
-
-
-  passed();
-}
@@ -1,284 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
-                            float yStep, uint maxIter) {
-
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-class hipPerfDeviceConcurrency {
-  public:
-  hipPerfDeviceConcurrency();
-  ~hipPerfDeviceConcurrency();
-
-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
-
-  void open(void);
-  void close(void);
-  void run(unsigned int testCase, int numGpus);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numDevices;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int coordIdx;
-  unsigned long long totalIters = 0;
-};
-
-
-hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
-
-hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
-
-void hipPerfDeviceConcurrency::open(void) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  setNumGpus(nGpu);
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-}
-
-
-void hipPerfDeviceConcurrency::close() {
-}
-
-void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
-
-
-  static int deviceId;
-  uint * hPtr[numGpus];
-  uint * dPtr[numGpus];
-  hipStream_t streams[numGpus];
-  int numCUs[numGpus];
-  unsigned int maxIter[numGpus];
-  unsigned long long expectedIters[numGpus];
-
-  int threads, threads_per_block, blocks;
-  float xStep, yStep, xPos, yPos;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, i));
-
-  if (testCase != 0) {
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
-            << i << std::endl;
-  }
-
-  numCUs[i] = props.multiProcessorCount;
-  int clkFrequency = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
-  maxIter[i] = (maxIter[i] + 15) & ~15;
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  HIPCHECK(hipStreamCreate(&streams[i]));
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-  setData(hPtr[i], 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-
-  // Prepare kernel launch parameters
-  threads = (bufSize/sizeof(uint));
-  threads_per_block  = 64;
-  blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  xStep = (float)(coords[coordIdx].width / (double)width_);
-  yStep = (float)(-coords[coordIdx].width / (double)width_);
-  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory from host to device
-  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
-
-  }
-
-  for(int i = 0; i < numGpus; i++) {
-    HIPCHECK(hipStreamSynchronize(0));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-  HIPCHECK(hipSetDevice(deviceId));
-
-  // Copy data back from device to the host
-  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
-
-  checkData(hPtr[i]);
-  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
-
-  if (testCase != 0) {
-    checkData(hPtr[i]);
-    if(totalIters != expectedIters[i]) {
-      std::cout << "Incorrect iteration count detected" << std::endl;
-    }
-  }
-
-
-  HIPCHECK(hipStreamDestroy(streams[i]));
-
-  // Free host and device memory
-  HIPCHECK(hipHostFree(hPtr[i]));
-  HIPCHECK(hipFree(dPtr[i]));
-  }
-
-  if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
-            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
-  }
-
-  if(testCase == 0) {
-    deviceId++;
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfDeviceConcurrency deviceConcurrency;
-
-  deviceConcurrency.open();
-
-  int nGpu = deviceConcurrency.getNumGpus();
-
-  // testCase = 0 refers to warmup kernel run
-  int testCase = 0;
-
-  for (int i = 0; i < nGpu; i++) {
-    // Warm-up kernel on all devices
-    deviceConcurrency.run(testCase, 1);
-  }
-
-  // Time for kernel on 1 device
-  deviceConcurrency.run(++testCase, 1);
-
-  // Time for kernel on all available devices
-  deviceConcurrency.run(++testCase, nGpu);
-
-  passed();
-}
@@ -1,432 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-
-#ifdef __HIP_PLATFORM_NVIDIA__
-inline __device__ float4 operator*(float s, float4 a)
-{
-  return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-}
-inline __device__ float4 operator*(float4 a, float4 b)
-{
-  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-}
-inline __device__ float4 operator+(float4 a, float4 b)
-{
-  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-inline __device__ float4 operator-(float4 a, float4 b)
-{
-  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-}
-#endif
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},  // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
-         float xStep, float yStep, uint maxIter) {
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % (width/4);
-  int j = tid / (width/4);
-  int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
-  int4 vecj = make_int4(j, j, j, j);
-  float4 x0;
-  x0.x = (float)(xPos + xStep*veci.x);
-  x0.y = (float)(xPos + xStep*veci.y);
-  x0.z = (float)(xPos + xStep*veci.z);
-  x0.w = (float)(xPos + xStep*veci.w);
-  float4 y0;
-  y0.x = (float)(yPos + yStep*vecj.x);
-  y0.y = (float)(yPos + yStep*vecj.y);
-  y0.z = (float)(yPos + yStep*vecj.z);
-  y0.w = (float)(yPos + yStep*vecj.w);
-  float4 x = x0;
-  float4 y = y0;
-  uint iter = 0;
-  float4 tmp;
-  int4 stay;
-  int4 ccount = make_int4(0, 0, 0, 0);
-  float4 savx = x;
-  float4 savy = y;
-  stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
-  stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
-  stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
-  stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
-  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
-  iter+=16) {
-    x = savx;
-    y = savy;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
-    stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
-    stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
-    stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
-    savx.x = (bool)(stay.x ? x.x : savx.x);
-    savx.y = (bool)(stay.y ? x.y : savx.y);
-    savx.z = (bool)(stay.z ? x.z : savx.z);
-    savx.w = (bool)(stay.w ? x.w : savx.w);
-    savy.x = (bool)(stay.x ? y.x : savy.x);
-    savy.y = (bool)(stay.y ? y.y : savy.y);
-    savy.z = (bool)(stay.z ? y.z : savy.z);
-    savy.w = (bool)(stay.w ? y.w : savy.w);
-    ccount.x -= stay.x*16;
-    ccount.y -= stay.y*16;
-    ccount.z -= stay.z*16;
-    ccount.w -= stay.w*16;
-  }
-  // Handle remainder
-  if (!(stay.x & stay.y & stay.z & stay.w))
-  {
-    iter = 16;
-    do
-    {
-      x = savx;
-      y = savy;
-      stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x <  maxIter);
-      stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y <  maxIter);
-      stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z <  maxIter);
-      stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w <  maxIter);
-      tmp = x;
-      x = x*x + x0 - y*y;
-      y = 2.0f*tmp*y + y0;
-      ccount.x += stay.x;
-      ccount.y += stay.y;
-      ccount.z += stay.z;
-      ccount.w += stay.w;
-      iter--;
-      savx.x = (stay.x ? x.x : savx.x);
-      savx.y = (stay.y ? x.y : savx.y);
-      savx.z = (stay.z ? x.z : savx.z);
-      savx.w = (stay.w ? x.w : savx.w);
-      savy.x = (stay.x ? y.x : savy.x);
-      savy.y = (stay.y ? y.y : savy.y);
-      savy.z = (stay.z ? y.z : savy.z);
-      savy.w = (stay.w ? y.w : savy.w);
-    } while ((stay.x | stay.y | stay.z | stay.w) && iter);
-  }
-  uint4 *vecOut = (uint4 *)out;
-  vecOut[tid].x = (uint)(ccount.x);
-  vecOut[tid].y = (uint)(ccount.y);
-  vecOut[tid].z = (uint)(ccount.z);
-  vecOut[tid].w = (uint)(ccount.w);
-}
-
-class hipPerfStreamConcurrency {
-  public:
-  hipPerfStreamConcurrency();
-  ~hipPerfStreamConcurrency();
-
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
-
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
-
-  void open(int deviceID);
-  void run(unsigned int testCase, unsigned int deviceId);
-  void close(void);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numKernels;
-  unsigned int numStreams;
-
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int maxIter;
-  unsigned int coordIdx;
-  unsigned long long totalIters;
-  int numCUs;
-
-};
-
-
-hipPerfStreamConcurrency::hipPerfStreamConcurrency() {}
-
-hipPerfStreamConcurrency::~hipPerfStreamConcurrency() {}
-
-void hipPerfStreamConcurrency::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId  << std::endl;
-
-  numCUs = props.multiProcessorCount;
-}
-
-
-void hipPerfStreamConcurrency::close() {
-}
-
-
-void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) {
-
-  int clkFrequency = 0;
-  unsigned int numStreams = getNumStreams();
-  unsigned int numKernels = getNumKernels();
-
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
-  maxIter = (maxIter + 15) & ~15;
-
-  hipStream_t streams[numStreams];
-
-  uint * hPtr[numKernels];
-  uint * dPtr[numKernels];
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  for (uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-
-  // Allocate memory on the host and device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-    setData(hPtr[i], 0xdeadbeef);
-    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-  }
-
-
-  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory asynchronously and concurrently from host to device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
-  }
-
-
-  // Synchronize to make sure all the copies are completed
-  for(uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamSynchronize(streams[i]));
-  }
-
-  // Warm-up kernel with lower iteration
-  if (testCase == 0) {
-    maxIter = 256;
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i%numStreams],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
-  }
-
-
-  // Synchronize all the concurrent streans to have completed execution
-  for(uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamSynchronize(streams[i]));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
-  }
-
-
-  if (testCase != 0) {
-  std::cout <<"Measured time for " << numKernels <<" kernels (s) on " << numStreams <<" stream (s): "
-    << all_kernel_time.count() << std::endl;
-  }
-
-
-  unsigned long long expected =
-    (unsigned long long)width_ * (unsigned long long)maxIter;
-
-  for(uint i = 0 ; i < numStreams; i++) {
-    HIPCHECK(hipStreamDestroy(streams[i]));
-  }
-
-
-  // Free host and device memory
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostFree(hPtr[i]));
-    HIPCHECK(hipFree(dPtr[i]));
-  }
-
-
-}
-
-
-void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfStreamConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfStreamConcurrency streamConcurrency;
-  int deviceId = 0;
-
-  streamConcurrency.open(deviceId);
-
-  for (unsigned int testCase = 0; testCase < 5; testCase++) {
-
-
-  switch (testCase) {
-
-
-  case 0:
-    // Warm-up kernel
-    streamConcurrency.setNumStreams(1);
-    streamConcurrency.setNumKernels(1);
-    break;
-
-  case 1:
-  // default stream executes serially
-  streamConcurrency.setNumStreams(1);
-  streamConcurrency.setNumKernels(1);
-  break;
-
-  case 2:
-    // 2-way concurrency
-    streamConcurrency.setNumStreams(2);
-    streamConcurrency.setNumKernels(2);
-    break;
-
-  case 3:
-    // 4-way concurrency
-    streamConcurrency.setNumStreams(4);
-    streamConcurrency.setNumKernels(4);
-    break;
-
-  case 4:
-    streamConcurrency.setNumStreams(2);
-    streamConcurrency.setNumKernels(4);
-    break;
-
-  case 5:
-    break;
-
-  default:
-    break;
-  }
-  streamConcurrency.run(testCase, deviceId);
-
-  }
-
-
-  passed();
-}
@@ -1,131 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-using namespace std;
-
-#define BufSize 0x1000
-#define Iterations 0x100
-#define TotalStreams 4
-#define TotalBufs 4
-
-
-class hipPerfStreamCreateCopyDestroy {
-  private:
-    unsigned int numBuffers_;
-    unsigned int numStreams_;
-    const size_t totalStreams_[TotalStreams];
-    const size_t totalBuffers_[TotalBufs];
-  public:
-    hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
-                                       totalStreams_{1, 2, 4, 8},
-                                       totalBuffers_{1, 100, 1000, 5000} {};
-    ~hipPerfStreamCreateCopyDestroy() {};
-    void open(int deviceID);
-    void run(unsigned int testNumber);
-};
-
-void hipPerfStreamCreateCopyDestroy::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId  << std::endl;
-}
-
-void hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
-  numStreams_ = totalStreams_[testNumber % TotalStreams];
-  size_t iter = Iterations / (numStreams_ * ((size_t)1 << (testNumber / TotalBufs + 1)));
-  hipStream_t streams[numStreams_];
-
-  numBuffers_ = totalBuffers_[testNumber / TotalBufs];
-  float* dSrc[numBuffers_];
-  size_t nBytes = BufSize * sizeof(float);
-
-  for (size_t b = 0; b < numBuffers_; ++b) {
-    HIPCHECK(hipMalloc(&dSrc[b], nBytes));
-  }
-
-  float* hSrc;
-  hSrc = new float[nBytes];
-  HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
-  for (size_t i = 0; i < BufSize; i++) {
-    hSrc[i] = 1.618f + i;
-  }
-
-  auto start = std::chrono::steady_clock::now();
-
-  for (size_t i = 0; i < iter; ++i) {
-    for (size_t s = 0; s < numStreams_; ++s) {
-      HIPCHECK(hipStreamCreate(&streams[s]));
-    }
-
-    for (size_t s = 0; s < numStreams_; ++s) {
-      for (size_t b = 0; b < numBuffers_; ++b) {
-        HIPCHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
-      }
-    }
-
-    for (size_t s = 0; s < numStreams_; ++s) {
-      HIPCHECK(hipStreamDestroy(streams[s]));
-    }
-  }
-
-  auto end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> diff = end - start;
-
-  auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
-
-  cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
-       << setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
-       << iter << " iterations " << time << " (ms) " << endl;
-
-  delete [] hSrc;
-  for (size_t b = 0; b < numBuffers_; ++b) {
-    HIPCHECK(hipFree(dSrc[b]));
-  }
-}
-
-int main(int argc, char* argv[]) {
-  hipPerfStreamCreateCopyDestroy streamCCD;
-
-  int deviceId = 0;
-  streamCCD.open(deviceId);
-
-  for (auto testCase = 0; testCase < TotalStreams * TotalBufs; testCase++) {
-    streamCCD.run(testCase);
-  }
-
-  passed();
-}