From a17c4dce84aacee59bcbefe6cf3b0eef6b54a6e4 Mon Sep 17 00:00:00 2001
From: Sourabh Betigeri <sourabh.betigeri@amd.com>
Date: Fri, 4 Sep 2020 13:14:29 -0700
Subject: [PATCH] SWDEV-248163: Adding new Mandelbrot's tests - Serial vs
 concurrent kernel execution and mad vs fma performance results

Change-Id: I5118d13b40e2d17414124eebfd1b5114641db978
---
 .../performance/compute/hipPerfMandelbrot.cpp | 747 ++++++++++++++++++
 1 file changed, 747 insertions(+)
 create mode 100644 hipamd/tests/performance/compute/hipPerfMandelbrot.cpp
diff --git a/hipamd/tests/performance/compute/hipPerfMandelbrot.cpp b/hipamd/tests/performance/compute/hipPerfMandelbrot.cpp
new file mode 100644
index 0000000000..c4234d8c37
--- /dev/null
+++ b/hipamd/tests/performance/compute/hipPerfMandelbrot.cpp
@@ -0,0 +1,747 @@
+/*
+ Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+#include <hip/hip_vector_types.h>
+#include <hip/math_functions.h>
+#include <vector>
+#include <string>
+#include <map>
+
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+coordRec coords[] = {
+    {0.0, 0.0, 4.0},                                     // Whole set
+    {0.0, 0.0, 0.00001},                                 // All black
+    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+template <typename T>
+__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
+                                  uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+template <typename T>
+__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
+    T yPos, T xStep, T yStep, uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*(float)i);
+  float y0 = (float)(yPos + yStep*(float)j);
+
+  float x = x0;
+  float y = y0;
+
+#define FAST
+  uint iter = 0;
+  float tmp;
+  int stay;
+  int ccount = 0;
+  stay = (x*x+y*y) <= 4.0;
+  float savx = x;
+  float savy = y;
+#ifdef FAST
+  for (iter = 0; (iter < maxIter); iter+=16) {
+#else
+  for (iter = 0; stay && (iter < maxIter); iter+=16) {
+#endif
+    x = savx;
+    y = savy;
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    stay = (x*x+y*y) <= 4.0;
+    savx = (stay ? x : savx);
+    savy = (stay ? y : savy);
+    ccount += stay*16;
+#ifdef FAST
+    if (!stay)
+      break;
+#endif
+  }
+  // Handle remainder
+  if (!stay) {
+    iter = 16;
+    do {
+      x = savx;
+      y = savy;
+      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+      tmp = x;
+      x =  fma(-y,y, fma(x,x,x0));
+      y =  fma(2.0f*tmp,y,y0);
+      ccount += stay;
+      iter--;
+      savx = (stay ? x : savx);
+      savy = (stay ? y : savy);
+    } while (stay && iter);
+  }
+
+
+  out[tid] = (uint)ccount;
+
+};
+
+
+template <typename T>
+__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
+                                   uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  double x0 = (double)(xPos + xStep*i);
+  double y0 = (double)(yPos + yStep*j);
+
+  double x = x0;
+  double y = y0;
+
+  uint iter = 0;
+  double tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+  out[tid] = iter;
+};
+
+
+template <typename T>
+__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
+                  T yPos, T xStep, T yStep, uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+  int i = tid % width;
+  int j = tid / width;
+  double x0 = (double)(xPos + xStep*(double)i);
+  double y0 = (double)(yPos + yStep*(double)j);
+
+  double x = x0;
+  double y = y0;
+
+#define FAST
+  uint iter = 0;
+  double tmp;
+  int stay;
+  int ccount = 0;
+  stay = (x*x+y*y) <= 4.0;
+  double savx = x;
+  double savy = y;
+#ifdef FAST
+  for (iter = 0; (iter < maxIter); iter+=16)
+#else
+  for (iter = 0; stay && (iter < maxIter); iter+=16)
+#endif
+  {
+    x = savx;
+    y = savy;
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    stay = (x*x+y*y) <= 4.0;
+    savx = (stay ? x : savx);
+    savy = (stay ? y : savy);
+    ccount += stay*16;
+#ifdef FAST
+    if (!stay)
+      break;
+#endif
+    }
+  // Handle remainder
+    if (!stay) {
+      iter = 16;
+      do {
+        x = savx;
+        y = savy;
+        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+        tmp = x;
+        x =  fma(-y,y, fma(x,x,x0));
+        y =  fma(2.0f*tmp,y,y0);
+        ccount += stay;
+        iter--;
+        savx = (stay ? x : savx);
+        savy = (stay ? y : savy);
+      }
+      while (stay && iter);
+
+    }
+    out[tid] = (uint)ccount;
+};
+
+static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
+
+// Expected results for each kernel run at each coord
+unsigned long long expectedIters[] = {
+    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
+    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
+    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
+    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
+
+class hipPerfMandelBrot {
+  public:
+  hipPerfMandelBrot();
+  ~hipPerfMandelBrot();
+
+  void setNumKernels(unsigned int num) {
+    numKernels = num;
+  }
+
+  unsigned int getNumKernels() {
+    return numKernels;
+  }
+
+  void setNumStreams(unsigned int num) {
+    numStreams = num;
+  }
+  unsigned int getNumStreams() {
+    return numStreams;
+  }
+
+  void open(int deviceID);
+  void run(unsigned int testCase, unsigned int deviceId);
+  void printResults(void);
+
+  // array of funtion pointers
+  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
+                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
+                 int threads_per_block, int kernelCnt);
+
+  // Wrappers
+  void float_mad(uint *out, uint width, float xPos,  float yPos,
+                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
+                  int blocks, int threads_per_block, int kernelCnt);
+
+  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
+                            int blocks, int threads_per_block, int kernelCnt);
+
+  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
+                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                   int threads_per_block, int kernelCnt);
+
+  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
+                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                             int threads_per_block, int kernelCnt);
+
+  hipStream_t streams[2];
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int numKernels;
+  unsigned int numStreams;
+
+  std::map<std::string, std::vector<double>> results;
+  unsigned int width_;
+  unsigned int bufSize;
+  unsigned int maxIter;
+  unsigned int coordIdx;
+  volatile unsigned long long totalIters = 0;
+  int numCUs;
+  static const unsigned int numLoops = 10;
+};
+
+
+hipPerfMandelBrot::hipPerfMandelBrot() {}
+
+hipPerfMandelBrot::~hipPerfMandelBrot() {}
+
+void hipPerfMandelBrot::open(int deviceId) {
+
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+  std::cout << "info: didn't find any GPU! skipping the test!\n";
+  passed();
+  return;
+  }
+
+
+  HIPCHECK(hipSetDevice(deviceId));
+  hipDeviceProp_t props = {0};
+  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
+  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
+    << std::endl;
+
+  numCUs = props.multiProcessorCount;
+}
+
+
+void hipPerfMandelBrot::printResults() {
+
+  int numkernels = getNumKernels();
+  int numStreams = getNumStreams();
+
+  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
+            << numStreams << " streams (s)" <<  std::endl;
+
+  std::map<std::string, std::vector<double>>:: iterator itr;
+  for (itr = results.begin(); itr != results.end(); itr++) {
+          std::cout << "\n" << std::setw(20) << itr->first << " ";
+          for(auto i : results[itr->first]) {
+            std::cout << std::setw(10) << i << " ";
+            }
+     }
+  results.clear();
+
+  std::cout << std::endl;
+}
+
+
+// Wrappers for the kernel launches
+void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
+                                   float yStep, uint maxIter, hipStream_t* streams,
+                                   int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
+                      streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
+                      maxIter);
+
+
+}
+
+
+void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                             int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
+                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                               int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                              int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
+
+  unsigned int numStreams = getNumStreams();
+
+  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
+               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
+
+  // Maximum iteration count
+  maxIter = 32768;
+
+  uint * hPtr[numKernels];
+  uint * dPtr[numKernels];
+
+  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
+  width_ = 256;
+
+  bufSize = width_  * width_ * sizeof(uint);
+
+  // Create streams for concurrency
+  for (uint i = 0; i < numStreams; i++) {
+    HIPCHECK(hipStreamCreate(&streams[i]));
+  }
+
+
+  // Allocate memory on the host and device
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+    setData(hPtr[i], 0xdeadbeef);
+    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
+  }
+
+
+  // Prepare kernel launch parameters
+  int threads = (bufSize/sizeof(uint));
+  int threads_per_block  = 64;
+  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
+
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Copy memory asynchronously and concurrently from host to device
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+  }
+
+  // Synchronize to make sure all the copies are completed
+  HIPCHECK(hipStreamSynchronize(0));
+
+  int kernelIdx;
+  if(testCase == 0 || testCase == 5 || testCase == 10) {
+    kernelIdx = 0;
+  }
+
+  else if(testCase == 1 || testCase == 6 || testCase == 11) {
+    kernelIdx = 1;
+  }
+  else if(testCase == 2 || testCase == 7 || testCase == 12) {
+    kernelIdx = 2;
+  }
+  else if(testCase == 3 || testCase == 8 || testCase == 13){
+    kernelIdx = 3;
+  }
+
+
+  double totalTime = 0.0;
+
+  for (unsigned int k = 0; k < numLoops; k++) {
+
+  coordIdx = testCase % numCoords;
+
+  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
+                  testCase == 5 || testCase == 6 || testCase == 7 ||
+                  testCase == 10 || testCase == 11 || testCase == 12)) {
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for (uint i = 0; i < numKernels; i++) {
+    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                           threads_per_block, i);
+  }
+
+
+  // Synchronize all the concurrent streams to have completed execution
+  HIPCHECK(hipStreamSynchronize(0));
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+  totalTime += all_kernel_time.count();
+
+  }
+
+
+  else {
+  double xStep = coords[coordIdx].width / (double)width_;
+  double yStep = -coords[coordIdx].width / (double)width_;
+  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
+  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for (uint i = 0; i < numKernels; i++) {
+  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                         threads_per_block, i);
+  }
+
+
+  // Synchronize all the concurrent streams to have completed execution
+  HIPCHECK(hipStreamSynchronize(0));
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+  totalTime += all_kernel_time.count();
+  }
+
+
+  }
+
+  // Copy data back from device to the host
+  for(uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
+  }
+
+
+  for(uint i = 0; i < numKernels; i++) {
+  checkData(hPtr[i]);
+
+  int j =0;
+  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
+          j++;
+  }
+
+  if(j==30) {
+    std::cout << "Incorrect iteration count detected. ";
+  }
+
+  }
+
+
+  // Compute GFLOPS.  There are 7 FLOPs per iteration
+  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
+                (totalTime / (double)numLoops);
+
+
+  std::vector<std::string> kernelName = {"float", "float_unroll",
+                      "double", "double_unroll"};
+
+  // Print results except for Warm-up kernel
+  if(testCase!=100) {
+  results[kernelName[testCase % 4]].push_back(perf);
+ }
+
+
+  for(uint i = 0 ; i < numStreams; i++) {
+    HIPCHECK(hipStreamDestroy(streams[i]));
+  }
+
+
+  // Free host and device memory
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipFree(hPtr[i]));
+    HIPCHECK(hipFree(dPtr[i]));
+  }
+
+
+}
+
+
+void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+      ptr2[i] = value;
+  }
+}
+
+
+void hipPerfMandelBrot::checkData(uint *ptr) {
+  totalIters = 0;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    totalIters += ptr[i];
+  }
+}
+
+
+int main(int argc, char* argv[]) {
+  hipPerfMandelBrot mandelbrotCompute;
+  int deviceId = 0;
+
+  mandelbrotCompute.open(deviceId);
+
+  for (unsigned int testCase = 0; testCase < 3; testCase++) {
+
+
+  switch (testCase) {
+
+
+  case 0: {
+    // Warmup-kernel - default stream executes serially
+    mandelbrotCompute.setNumStreams(1);
+    mandelbrotCompute.setNumKernels(1);
+    mandelbrotCompute.run(100/*Random number*/, deviceId);
+    break;
+    }
+
+
+  case 1: {
+    // run all - sync
+    int i = 0;
+    do {
+    mandelbrotCompute.setNumStreams(1);
+    mandelbrotCompute.setNumKernels(1);
+    mandelbrotCompute.run(i, deviceId);
+    i++;
+    }while(i < 12);
+    mandelbrotCompute.printResults();
+
+    break;
+  }
+
+
+  case 2: {
+    // run all - async
+    int i = 0;
+    do {
+    mandelbrotCompute.setNumStreams(2);
+    mandelbrotCompute.setNumKernels(2);
+    mandelbrotCompute.run(i, deviceId);
+    i++;
+    }while(i < 12);
+    mandelbrotCompute.printResults();
+
+    break;
+
+  }
+
+
+  default: {
+    break;
+  }
+
+
+  }
+
+
+
+  }
+
+
+  passed();
+}