Revert "Revert "Merge branch 'amd-master-next' into amd-npi-next""

This reverts commit 5bc903a0cd. Reason for revert: <INSERT REASONING HERE> Change-Id: I92ceb171e31026ed1864704cef2fc1497b883ef9 [ROCm/hip-tests commit: 08c35854c0]
2020-10-05 13:20:58 -04:00
@@ -0,0 +1,747 @@
+/*
+ Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+#include <hip/hip_vector_types.h>
+#include <hip/math_functions.h>
+#include <vector>
+#include <string>
+#include <map>
+
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+coordRec coords[] = {
+    {0.0, 0.0, 4.0},                                     // Whole set
+    {0.0, 0.0, 0.00001},                                 // All black
+    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+template <typename T>
+__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
+                                  uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+template <typename T>
+__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
+    T yPos, T xStep, T yStep, uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*(float)i);
+  float y0 = (float)(yPos + yStep*(float)j);
+
+  float x = x0;
+  float y = y0;
+
+#define FAST
+  uint iter = 0;
+  float tmp;
+  int stay;
+  int ccount = 0;
+  stay = (x*x+y*y) <= 4.0;
+  float savx = x;
+  float savy = y;
+#ifdef FAST
+  for (iter = 0; (iter < maxIter); iter+=16) {
+#else
+  for (iter = 0; stay && (iter < maxIter); iter+=16) {
+#endif
+    x = savx;
+    y = savy;
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    stay = (x*x+y*y) <= 4.0;
+    savx = (stay ? x : savx);
+    savy = (stay ? y : savy);
+    ccount += stay*16;
+#ifdef FAST
+    if (!stay)
+      break;
+#endif
+  }
+  // Handle remainder
+  if (!stay) {
+    iter = 16;
+    do {
+      x = savx;
+      y = savy;
+      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+      tmp = x;
+      x =  fma(-y,y, fma(x,x,x0));
+      y =  fma(2.0f*tmp,y,y0);
+      ccount += stay;
+      iter--;
+      savx = (stay ? x : savx);
+      savy = (stay ? y : savy);
+    } while (stay && iter);
+  }
+
+
+  out[tid] = (uint)ccount;
+
+};
+
+
+template <typename T>
+__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
+                                   uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  double x0 = (double)(xPos + xStep*i);
+  double y0 = (double)(yPos + yStep*j);
+
+  double x = x0;
+  double y = y0;
+
+  uint iter = 0;
+  double tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+  out[tid] = iter;
+};
+
+
+template <typename T>
+__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
+                  T yPos, T xStep, T yStep, uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+  int i = tid % width;
+  int j = tid / width;
+  double x0 = (double)(xPos + xStep*(double)i);
+  double y0 = (double)(yPos + yStep*(double)j);
+
+  double x = x0;
+  double y = y0;
+
+#define FAST
+  uint iter = 0;
+  double tmp;
+  int stay;
+  int ccount = 0;
+  stay = (x*x+y*y) <= 4.0;
+  double savx = x;
+  double savy = y;
+#ifdef FAST
+  for (iter = 0; (iter < maxIter); iter+=16)
+#else
+  for (iter = 0; stay && (iter < maxIter); iter+=16)
+#endif
+  {
+    x = savx;
+    y = savy;
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    stay = (x*x+y*y) <= 4.0;
+    savx = (stay ? x : savx);
+    savy = (stay ? y : savy);
+    ccount += stay*16;
+#ifdef FAST
+    if (!stay)
+      break;
+#endif
+    }
+  // Handle remainder
+    if (!stay) {
+      iter = 16;
+      do {
+        x = savx;
+        y = savy;
+        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+        tmp = x;
+        x =  fma(-y,y, fma(x,x,x0));
+        y =  fma(2.0f*tmp,y,y0);
+        ccount += stay;
+        iter--;
+        savx = (stay ? x : savx);
+        savy = (stay ? y : savy);
+      }
+      while (stay && iter);
+
+    }
+    out[tid] = (uint)ccount;
+};
+
+static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
+
+// Expected results for each kernel run at each coord
+unsigned long long expectedIters[] = {
+    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
+    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
+    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
+    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
+
+class hipPerfMandelBrot {
+  public:
+  hipPerfMandelBrot();
+  ~hipPerfMandelBrot();
+
+  void setNumKernels(unsigned int num) {
+    numKernels = num;
+  }
+
+  unsigned int getNumKernels() {
+    return numKernels;
+  }
+
+  void setNumStreams(unsigned int num) {
+    numStreams = num;
+  }
+  unsigned int getNumStreams() {
+    return numStreams;
+  }
+
+  void open(int deviceID);
+  void run(unsigned int testCase, unsigned int deviceId);
+  void printResults(void);
+
+  // array of funtion pointers
+  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
+                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
+                 int threads_per_block, int kernelCnt);
+
+  // Wrappers
+  void float_mad(uint *out, uint width, float xPos,  float yPos,
+                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
+                  int blocks, int threads_per_block, int kernelCnt);
+
+  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
+                            int blocks, int threads_per_block, int kernelCnt);
+
+  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
+                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                   int threads_per_block, int kernelCnt);
+
+  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
+                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                             int threads_per_block, int kernelCnt);
+
+  hipStream_t streams[2];
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int numKernels;
+  unsigned int numStreams;
+
+  std::map<std::string, std::vector<double>> results;
+  unsigned int width_;
+  unsigned int bufSize;
+  unsigned int maxIter;
+  unsigned int coordIdx;
+  volatile unsigned long long totalIters = 0;
+  int numCUs;
+  static const unsigned int numLoops = 10;
+};
+
+
+hipPerfMandelBrot::hipPerfMandelBrot() {}
+
+hipPerfMandelBrot::~hipPerfMandelBrot() {}
+
+void hipPerfMandelBrot::open(int deviceId) {
+
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+  std::cout << "info: didn't find any GPU! skipping the test!\n";
+  passed();
+  return;
+  }
+
+
+  HIPCHECK(hipSetDevice(deviceId));
+  hipDeviceProp_t props = {0};
+  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
+  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
+    << std::endl;
+
+  numCUs = props.multiProcessorCount;
+}
+
+
+void hipPerfMandelBrot::printResults() {
+
+  int numkernels = getNumKernels();
+  int numStreams = getNumStreams();
+
+  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
+            << numStreams << " streams (s)" <<  std::endl;
+
+  std::map<std::string, std::vector<double>>:: iterator itr;
+  for (itr = results.begin(); itr != results.end(); itr++) {
+          std::cout << "\n" << std::setw(20) << itr->first << " ";
+          for(auto i : results[itr->first]) {
+            std::cout << std::setw(10) << i << " ";
+            }
+     }
+  results.clear();
+
+  std::cout << std::endl;
+}
+
+
+// Wrappers for the kernel launches
+void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
+                                   float yStep, uint maxIter, hipStream_t* streams,
+                                   int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
+                      streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
+                      maxIter);
+
+
+}
+
+
+void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                             int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
+                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                               int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                              int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
+
+  unsigned int numStreams = getNumStreams();
+
+  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
+               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
+
+  // Maximum iteration count
+  maxIter = 32768;
+
+  uint * hPtr[numKernels];
+  uint * dPtr[numKernels];
+
+  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
+  width_ = 256;
+
+  bufSize = width_  * width_ * sizeof(uint);
+
+  // Create streams for concurrency
+  for (uint i = 0; i < numStreams; i++) {
+    HIPCHECK(hipStreamCreate(&streams[i]));
+  }
+
+
+  // Allocate memory on the host and device
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+    setData(hPtr[i], 0xdeadbeef);
+    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
+  }
+
+
+  // Prepare kernel launch parameters
+  int threads = (bufSize/sizeof(uint));
+  int threads_per_block  = 64;
+  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
+
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Copy memory asynchronously and concurrently from host to device
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+  }
+
+  // Synchronize to make sure all the copies are completed
+  HIPCHECK(hipStreamSynchronize(0));
+
+  int kernelIdx;
+  if(testCase == 0 || testCase == 5 || testCase == 10) {
+    kernelIdx = 0;
+  }
+
+  else if(testCase == 1 || testCase == 6 || testCase == 11) {
+    kernelIdx = 1;
+  }
+  else if(testCase == 2 || testCase == 7 || testCase == 12) {
+    kernelIdx = 2;
+  }
+  else if(testCase == 3 || testCase == 8 || testCase == 13){
+    kernelIdx = 3;
+  }
+
+
+  double totalTime = 0.0;
+
+  for (unsigned int k = 0; k < numLoops; k++) {
+
+  coordIdx = testCase % numCoords;
+
+  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
+                  testCase == 5 || testCase == 6 || testCase == 7 ||
+                  testCase == 10 || testCase == 11 || testCase == 12)) {
+  float xStep = (float)(coords[coordIdx].width / (double)width_);
+  float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for (uint i = 0; i < numKernels; i++) {
+    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                           threads_per_block, i);
+  }
+
+
+  // Synchronize all the concurrent streams to have completed execution
+  HIPCHECK(hipStreamSynchronize(0));
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+  totalTime += all_kernel_time.count();
+
+  }
+
+
+  else {
+  double xStep = coords[coordIdx].width / (double)width_;
+  double yStep = -coords[coordIdx].width / (double)width_;
+  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
+  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for (uint i = 0; i < numKernels; i++) {
+  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                         threads_per_block, i);
+  }
+
+
+  // Synchronize all the concurrent streams to have completed execution
+  HIPCHECK(hipStreamSynchronize(0));
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+  totalTime += all_kernel_time.count();
+  }
+
+
+  }
+
+  // Copy data back from device to the host
+  for(uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
+  }
+
+
+  for(uint i = 0; i < numKernels; i++) {
+  checkData(hPtr[i]);
+
+  int j =0;
+  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
+          j++;
+  }
+
+  if(j==30) {
+    std::cout << "Incorrect iteration count detected. ";
+  }
+
+  }
+
+
+  // Compute GFLOPS.  There are 7 FLOPs per iteration
+  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
+                (totalTime / (double)numLoops);
+
+
+  std::vector<std::string> kernelName = {"float", "float_unroll",
+                      "double", "double_unroll"};
+
+  // Print results except for Warm-up kernel
+  if(testCase!=100) {
+  results[kernelName[testCase % 4]].push_back(perf);
+ }
+
+
+  for(uint i = 0 ; i < numStreams; i++) {
+    HIPCHECK(hipStreamDestroy(streams[i]));
+  }
+
+
+  // Free host and device memory
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipFree(hPtr[i]));
+    HIPCHECK(hipFree(dPtr[i]));
+  }
+
+
+}
+
+
+void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+      ptr2[i] = value;
+  }
+}
+
+
+void hipPerfMandelBrot::checkData(uint *ptr) {
+  totalIters = 0;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    totalIters += ptr[i];
+  }
+}
+
+
+int main(int argc, char* argv[]) {
+  hipPerfMandelBrot mandelbrotCompute;
+  int deviceId = 0;
+
+  mandelbrotCompute.open(deviceId);
+
+  for (unsigned int testCase = 0; testCase < 3; testCase++) {
+
+
+  switch (testCase) {
+
+
+  case 0: {
+    // Warmup-kernel - default stream executes serially
+    mandelbrotCompute.setNumStreams(1);
+    mandelbrotCompute.setNumKernels(1);
+    mandelbrotCompute.run(100/*Random number*/, deviceId);
+    break;
+    }
+
+
+  case 1: {
+    // run all - sync
+    int i = 0;
+    do {
+    mandelbrotCompute.setNumStreams(1);
+    mandelbrotCompute.setNumKernels(1);
+    mandelbrotCompute.run(i, deviceId);
+    i++;
+    }while(i < 12);
+    mandelbrotCompute.printResults();
+
+    break;
+  }
+
+
+  case 2: {
+    // run all - async
+    int i = 0;
+    do {
+    mandelbrotCompute.setNumStreams(2);
+    mandelbrotCompute.setNumKernels(2);
+    mandelbrotCompute.run(i, deviceId);
+    i++;
+    }while(i < 12);
+    mandelbrotCompute.printResults();
+
+    break;
+
+  }
+
+
+  default: {
+    break;
+  }
+
+
+  }
+
+
+
+  }
+
+
+  passed();
+}
@@ -0,0 +1,289 @@
+/*
+ Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},         // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
+                            float yStep, uint maxIter) {
+
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+class hipPerfDeviceConcurrency {
+  public:
+  hipPerfDeviceConcurrency();
+  ~hipPerfDeviceConcurrency();
+
+  void setNumGpus(unsigned int num) {
+    numDevices = num;
+  }
+  unsigned int getNumGpus() {
+    return numDevices;
+  }
+
+  void open(void);
+  void close(void);
+  void run(unsigned int testCase, int numGpus);
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int numDevices;
+  unsigned int width_;
+  unsigned int bufSize;
+  unsigned int coordIdx;
+  unsigned long long totalIters = 0;
+};
+
+
+hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
+
+hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
+
+void hipPerfDeviceConcurrency::open(void) {
+
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  setNumGpus(nGpu);
+  if (nGpu < 1) {
+  std::cout << "info: didn't find any GPU! skipping the test!\n";
+  passed();
+  }
+
+
+}
+
+
+void hipPerfDeviceConcurrency::close() {
+}
+
+void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
+
+
+  static int deviceId;
+  uint * hPtr[numGpus];
+  uint * dPtr[numGpus];
+  hipStream_t streams[numGpus];
+  int numCUs[numGpus];
+  unsigned int maxIter[numGpus];
+  unsigned long long expectedIters[numGpus];
+
+  int threads, threads_per_block, blocks;
+  float xStep, yStep, xPos, yPos;
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipDeviceProp_t props = {0};
+  HIPCHECK(hipGetDeviceProperties(&props, i));
+
+  if (testCase != 0) {
+  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
+            << i << std::endl;
+  }
+
+  numCUs[i] = props.multiProcessorCount;
+  int clkFrequency = 0;
+  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
+
+  clkFrequency =(unsigned int)clkFrequency/1000;
+
+  // Maximum iteration count
+  // maxIter = 8388608 * (engine_clock / 1000).serial execution
+  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
+  maxIter[i] = (maxIter[i] + 15) & ~15;
+
+  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
+  width_ = 256;
+
+  bufSize = width_ * width_ * sizeof(uint);
+
+  // Create streams for concurrency
+  HIPCHECK(hipStreamCreate(&streams[i]));
+
+  // Allocate memory on the host and device
+  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+  setData(hPtr[i], 0xdeadbeef);
+  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
+
+  // Prepare kernel launch parameters
+  threads = (bufSize/sizeof(uint));
+  threads_per_block  = 64;
+  blocks = (threads/threads_per_block) + (threads % threads_per_block);
+
+  coordIdx = testCase % numCoords;
+  xStep = (float)(coords[coordIdx].width / (double)width_);
+  yStep = (float)(-coords[coordIdx].width / (double)width_);
+  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Copy memory from host to device
+  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+
+  }
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
+                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
+
+  }
+
+  for(int i = 0; i < numGpus; i++) {
+    HIPCHECK(hipStreamSynchronize(0));
+  }
+
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+  HIPCHECK(hipSetDevice(deviceId));
+
+  // Copy data back from device to the host
+  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
+
+  checkData(hPtr[i]);
+  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
+
+  if (testCase != 0) {
+    checkData(hPtr[i]);
+    if(totalIters != expectedIters[i]) {
+      std::cout << "Incorrect iteration count detected" << std::endl;
+    }
+  }
+
+
+  HIPCHECK(hipStreamDestroy(streams[i]));
+
+  // Free host and device memory
+  HIPCHECK(hipFree(hPtr[i]));
+  HIPCHECK(hipFree(dPtr[i]));
+  }
+
+  if (testCase != 0) {
+  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
+            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
+  }
+
+  if(testCase == 0) {
+    deviceId++;
+  }
+
+
+}
+
+
+void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < width_ * width_ ; i++) {
+      ptr2[i] = value;
+  }
+}
+
+
+void hipPerfDeviceConcurrency::checkData(uint *ptr) {
+  totalIters = 0;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    totalIters += ptr[i];
+  }
+}
+
+
+int main(int argc, char* argv[]) {
+  hipPerfDeviceConcurrency deviceConcurrency;
+
+  deviceConcurrency.open();
+
+  int nGpu = deviceConcurrency.getNumGpus();
+
+  // testCase = 0 refers to warmup kernel run
+  int testCase = 0;
+
+  for (int i = 0; i < nGpu; i++) {
+    // Warm-up kernel on all devices
+    deviceConcurrency.run(testCase, 1);
+  }
+
+  // Time for kernel on 1 device
+  deviceConcurrency.run(++testCase, 1);
+
+  // Time for kernel on all available devices
+  deviceConcurrency.run(++testCase, nGpu);
+
+  passed();
+}
@@ -0,0 +1,20 @@
+project(bit_extract)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(bit_extract bit_extract.cpp)
+
+# Link with HIP
+target_link_libraries(bit_extract hip::host)
@@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc

 # Show how to use PLATFORM to specify different options for each compiler:
 ifeq (${HIP_PLATFORM}, nvcc)
-	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 
+	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
 endif

 EXE=bit_extract
-EXE_STATIC=bit_extract_static

 $(EXE): bit_extract.cpp
 	$(HIPCC) $(HIPCC_FLAGS) $< -o $@

-$(EXE_STATIC): bit_extract.cpp
-	$(HIPCC) -use-staticlib  $(HIPCC_FLAGS) $< -o $@
-
-all: $(EXE) $(EXE_STATIC)
+all: $(EXE)

 clean:
-	rm -f *.o $(EXE) $(EXE_STATIC)
+	rm -f *.o $(EXE)
@@ -0,0 +1,36 @@
+project(module_api)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(runKernel.hip.out runKernel.cpp)
+add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
+add_executable(defaultDriver.hip.out defaultDriver.cpp)
+
+# Generate code object
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../vcpy_kernel.cpp -o vcpy_kernel.code
+  COMMENT "codeobj generated"
+)
+
+add_dependencies(runKernel.hip.out codeobj)
+add_dependencies(launchKernelHcc.hip.out codeobj)
+add_dependencies(defaultDriver.hip.out codeobj)
+
+# Link with HIP
+target_link_libraries(runKernel.hip.out hip::host)
+target_link_libraries(launchKernelHcc.hip.out hip::host)
+target_link_libraries(defaultDriver.hip.out hip::host)
@@ -0,0 +1,30 @@
+project(modile_api_global)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(runKernel.hip.out runKernel.cpp)
+
+# Generate code object
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../vcpy_kernel.cpp -o vcpy_kernel.code
+  COMMENT "codeobj generated"
+)
+
+add_dependencies(runKernel.hip.out codeobj)
+
+# Link with HIP
+target_link_libraries(runKernel.hip.out hip::host)
@@ -0,0 +1,21 @@
+#Follow "README.md" to generate square.cpp if it's missing
+
+project(square)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(square square.cpp)
+
+# Link with HIP
+target_link_libraries(square hip::host)
@@ -11,7 +11,7 @@ else
 	SOURCES=square.cpp
 endif

-all: square.out square.out.static
+all: square.out

 # Step
 square.cpp: square.cu
@@ -20,8 +20,5 @@ square.cpp: square.cu
 square.out: $(SOURCES)
 	$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@

-square.out.static: $(SOURCES)
-	$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
-
 clean:
-	rm -f *.o *.out *.out.static square.cpp
+	rm -f *.o *.out square.cpp
@@ -1,13 +1,39 @@
 # Square.md

-Simple test which shows how to use hipify-perl to port CUDA code to HIP.  
-See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. 
+Simple test which shows how to use hipify-perl to port CUDA code to HIP.
+See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
 Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:

-1. Add hip/bin path to the PATH  :
-    <code>export PATH=$PATH:[MYHIP]/bin</code>
+- Add hip/bin path to the PATH

-2. <code>$ make </code>
-   Make runs these steps.  This can be performed on either CUDA or AMD platform:
-   <code>hipify-perl square.cu > square.cpp </code>    # convert cuda code to hip code
-   <code>hipcc square.cpp</code>                       # compile into executable
+```
+$ export PATH=$PATH:[MYHIP]/bin
+```
+
+- Define environment variable
+
+```
+$ export HIP_PATH=[MYHIP]
+```
+
+- Build executible file
+
+```
+$ cd ~/hip/samples/0_Intro/square
+$ make
+/home/user/hip/bin/hipify-perl square.cu > square.cpp
+/home/user/hip/bin/hipcc  square.cpp -o square.out
+/home/user/hip/bin/hipcc -use-staticlib  square.cpp -o square.out.static
+```
+- Execute file
+```
+$ ./square.out
+info: running on device Navi 14 [Radeon Pro W5500]
+info: allocate host mem (  7.63 MB)
+info: allocate device mem (  7.63 MB)
+info: copy Host2Device
+info: launch 'vector_square' kernel
+info: copy Device2Host
+info: check result
+PASSED!
+```
@@ -0,0 +1,20 @@
+project(hipBusBandwidth)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)
+
+# Link with HIP
+target_link_libraries(hipBusBandwidth hip::host)
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
 bool p_verbose = false;
 MallocMode p_malloc_mode = MallocPinned;
 int p_numa_ctl = -1;
-int p_iterations = 10;
+int p_iterations = 0;
 int p_beatsperiteration = 1;
 int p_device = 0;
 int p_detailed = 0;
@@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
 int sizes[] = {-64, -256, -512, 1,    2,    4,     8,     16,    32,     64,     128,   256,
               512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
 int nSizes = sizeof(sizes) / sizeof(int);
-
+// iterations to be run for the corresponding sizes, less number as the size increases
+int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
+               200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};

 // ****************************************************************************
 // Function: RunBenchmark_H2D
@@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
    hipEventCreate(&stop);
    CHECK_HIP_ERROR();

-    // Three passes, forward and backward both
-    for (int pass = 0; pass < p_iterations; pass++) {
-        // store the times temporarily to estimate latency
-        // float times[nSizes];
-        // Step through sizes forward on even passes and backward on odd
-        for (int i = 0; i < nSizes; i++) {
-            int sizeIndex;
-            if ((pass % 2) == 0)
-                sizeIndex = i;
-            else
-                sizeIndex = (nSizes - 1) - i;
+    // store the times temporarily to estimate latency
+    // float times[nSizes];
+    for (int i = 0; i < nSizes; i++) {
+      int sizeIndex, iterIndex;
+      sizeIndex = i;
+      iterIndex = i;

-            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-            const int nbytes = sizeToBytes(thisSize);
+      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+      const int nbytes = sizeToBytes(thisSize);
+      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+      for (int pass = 0; pass < niter; pass++) {

-            hipEventRecord(start, 0);
-            for (int j = 0; j < p_beatsperiteration; j++) {
-                memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
-            }
-            hipEventRecord(stop, 0);
-            hipEventSynchronize(stop);
-            float t = 0;
-            hipEventElapsedTime(&t, start, stop);
-            // times[sizeIndex] = t;
-
-            // Convert to GB/sec
-            if (p_verbose) {
-                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-            }
-
-            double speed =
-                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
-            char sizeStr[256];
-            if (p_beatsperiteration > 1) {
-                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-            } else {
-                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            }
-            resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
-                               "ms", t);
-
-            if (p_onesize) {
-                break;
-            }
+        hipEventRecord(start, 0);
+        for (int j = 0; j < p_beatsperiteration; j++) {
+          memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
        }
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        float t = 0;
+        hipEventElapsedTime(&t, start, stop);
+        // times[sizeIndex] = t;
+        // Convert to GB/sec
+        if (p_verbose) {
+          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+        }
+
+        double speed =
+            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
+        char sizeStr[256];
+        if (p_beatsperiteration > 1) {
+          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+        } else {
+          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        }
+        resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "GB/sec", speed);
+        resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,                                            "ms", t);
+
+      }
+      if (p_onesize) {
+        break;
+      }
    }

    if (p_onesize) {
@@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
    hipEventCreate(&stop);
    CHECK_HIP_ERROR();

-    // Three passes, forward and backward both
-    for (int pass = 0; pass < p_iterations; pass++) {
-        // store the times temporarily to estimate latency
-        // float times[nSizes];
-        // Step through sizes forward on even passes and backward on odd
-        for (int i = 0; i < nSizes; i++) {
-            int sizeIndex;
-            if ((pass % 2) == 0)
-                sizeIndex = i;
-            else
-                sizeIndex = (nSizes - 1) - i;
+    // store the times temporarily to estimate latency
+    // float times[nSizes];
+    for (int i = 0; i < nSizes; i++) {
+      int sizeIndex, iterIndex;
+      sizeIndex = i;
+      iterIndex = i;

-            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-            const int nbytes = sizeToBytes(thisSize);
+      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+      const int nbytes = sizeToBytes(thisSize);
+      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+      for (int pass = 0; pass < niter; pass++) {

-            hipEventRecord(start, 0);
-            for (int j = 0; j < p_beatsperiteration; j++) {
-                memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
-            }
-            hipEventRecord(stop, 0);
-            hipEventSynchronize(stop);
-            float t = 0;
-            hipEventElapsedTime(&t, start, stop);
-            // times[sizeIndex] = t;
-
-            // Convert to GB/sec
-            if (p_verbose) {
-                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-            }
-
-            double speed =
-                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-            char sizeStr[256];
-            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            if (p_beatsperiteration > 1) {
-                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-            } else {
-                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            }
-            resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "ms", t);
-            if (p_onesize) {
-                break;
-            }
+        hipEventRecord(start, 0);
+        for (int j = 0; j < p_beatsperiteration; j++) {
+          memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
        }
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        float t = 0;
+        hipEventElapsedTime(&t, start, stop);
+        // times[sizeIndex] = t;
+        // Convert to GB/sec
+        if (p_verbose) {
+          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+        }
+
+        double speed =
+            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+        char sizeStr[256];
+        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        if (p_beatsperiteration > 1) {
+          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+        } else {
+          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        }
+        resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "GB/sec", speed);
+        resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "ms", t);
+
+      }
+      if (p_onesize) {
+        break;
+      }
    }

    if (p_onesize) {
@@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
    hipStreamCreate(&stream[0]);
    hipStreamCreate(&stream[1]);

-    // Three passes, forward and backward both
-    for (int pass = 0; pass < p_iterations; pass++) {
-        // store the times temporarily to estimate latency
-        // float times[nSizes];
-        // Step through sizes forward on even passes and backward on odd
-        for (int i = 0; i < nSizes; i++) {
-            int sizeIndex;
-            if ((pass % 2) == 0)
-                sizeIndex = i;
-            else
-                sizeIndex = (nSizes - 1) - i;
+    // store the times temporarily to estimate latency
+    // float times[nSizes];
+    for (int i = 0; i < nSizes; i++) {
+      int sizeIndex, iterIndex;
+      sizeIndex = i;
+      iterIndex = i;

-            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-            const int nbytes = sizeToBytes(thisSize);
+      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+      const int nbytes = sizeToBytes(thisSize);
+      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+      for (int pass = 0; pass < niter; pass++) {

-            hipEventRecord(start, 0);
-            hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
-            hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
-            hipEventRecord(stop, 0);
-            hipEventSynchronize(stop);
-            float t = 0;
-            hipEventElapsedTime(&t, start, stop);
+        hipEventRecord(start, 0);
+        hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
+        hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        float t = 0;
+        hipEventElapsedTime(&t, start, stop);

-            // Convert to GB/sec
-            if (p_verbose) {
-                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-            }
-
-            double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
-            char sizeStr[256];
-            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            resultDB.AddResult(
-                std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
-                "GB/sec", speed);
-            resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "ms", t);
+        // Convert to GB/sec
+        if (p_verbose) {
+          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
        }
+
+        double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
+        char sizeStr[256];
+        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        resultDB.AddResult(
+            std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
+            "GB/sec", speed);
+        resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "ms", t);
+      }
+      if (p_onesize) {
+        break;
+      }
    }

    // Cleanup
@@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
            hipEventCreate(&stop);
            CHECK_HIP_ERROR();

-            // Three passes, forward and backward both
-            for (int pass = 0; pass < p_iterations; pass++) {
-                // store the times temporarily to estimate latency
-                // float times[nSizes];
-                // Step through sizes forward on even passes and backward on odd
-                for (int i = 0; i < nSizes; i++) {
-                    int sizeIndex;
-                    if ((pass % 2) == 0)
-                        sizeIndex = i;
-                    else
-                        sizeIndex = (nSizes - 1) - i;
+            // store the times temporarily to estimate latency
+            // float times[nSizes];
+            for (int i = 0; i < nSizes; i++) {
+              int sizeIndex, iterIndex;
+              sizeIndex = i;
+              iterIndex = i;

-                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-                    const int nbytes = sizeToBytes(thisSize);
+              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+              const int nbytes = sizeToBytes(thisSize);
+              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+              for (int pass = 0; pass < niter; pass++) {

-                    hipDeviceSynchronize();
+                hipDeviceSynchronize();

-                    hipEventRecord(start, 0);
+                hipEventRecord(start, 0);

-                    for (int j = 0; j < p_beatsperiteration; j++) {
-                        hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
-                    }
+                for (int j = 0; j < p_beatsperiteration; j++) {
+                  hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
+                }

-                    hipEventRecord(stop, 0);
+                hipEventRecord(stop, 0);

-                    hipEventSynchronize(stop);
+                hipEventSynchronize(stop);

-                    float t = 0;
-                    hipEventElapsedTime(&t, start, stop);
-                    // times[sizeIndex] = t;
+                float t = 0;
+                hipEventElapsedTime(&t, start, stop);
+                // times[sizeIndex] = t;

-                    // Convert to GB/sec
-                    if (p_verbose) {
-                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                    }
+                // Convert to GB/sec
+                if (p_verbose) {
+                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                }

-                    double speed =
-                        (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-                    char sizeStr[256];
-                    if (p_beatsperiteration > 1) {
-                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                                p_beatsperiteration);
-                    } else {
-                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                    }
+                double speed =
+                    (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+                char sizeStr[256];
+                if (p_beatsperiteration > 1) {
+                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                          p_beatsperiteration);
+                } else {
+                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                }

-                    string cGpu, pGpu;
-                    cGpu = gpuIDToString(currentGpu);
-                    pGpu = gpuIDToString(peerGpu);
+                string cGpu, pGpu;
+                cGpu = gpuIDToString(currentGpu);
+                pGpu = gpuIDToString(peerGpu);

-                    resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
-                                           "_gpu" + std::string(pGpu),
+                resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
+                                       "_gpu" + std::string(pGpu),
                                       sizeStr, "GB/sec", speed);
-                    resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
-                                           "_gpu" + std::string(pGpu),
+                resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
+                                       "_gpu" + std::string(pGpu),
                                       sizeStr, "ms", t);

-                    if (p_onesize) {
-                        break;
-                    }
+              }
+                if (p_onesize) {
+                  break;
                }
            }

@@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
            hipStreamCreate(&stream[0]);
            hipStreamCreate(&stream[1]);

-            // Three passes, forward and backward both
-            for (int pass = 0; pass < p_iterations; pass++) {
-                // store the times temporarily to estimate latency
-                // float times[nSizes];
-                // Step through sizes forward on even passes and backward on odd
-                for (int i = 0; i < nSizes; i++) {
-                    int sizeIndex;
-                    if ((pass % 2) == 0)
-                        sizeIndex = i;
-                    else
-                        sizeIndex = (nSizes - 1) - i;
+            // store the times temporarily to estimate latency
+            // float times[nSizes];
+            for (int i = 0; i < nSizes; i++) {
+              int sizeIndex, iterIndex;
+              sizeIndex = i;
+              iterIndex = i;

-                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-                    const int nbytes = sizeToBytes(thisSize);
+              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+              const int nbytes = sizeToBytes(thisSize);
+              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+              for (int pass = 0; pass < niter; pass++) {

-                    hipDeviceSynchronize();
+                hipDeviceSynchronize();

-                    hipEventRecord(start, 0);
+                hipEventRecord(start, 0);

-                    for (int j = 0; j < p_beatsperiteration; j++) {
-                        hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
-                                       hipMemcpyDeviceToDevice, stream[0]);
-                        hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
-                                       hipMemcpyDeviceToDevice, stream[1]);
-                    }
-
-                    hipEventRecord(stop, 0);
-
-                    hipEventSynchronize(stop);
-
-                    float t = 0;
-                    hipEventElapsedTime(&t, start, stop);
-                    // times[sizeIndex] = t;
-
-                    // Convert to GB/sec
-                    if (p_verbose) {
-                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                    }
-
-                    double speed =
-                        (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
-                        t;
-                    char sizeStr[256];
-                    if (p_beatsperiteration > 1) {
-                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                                p_beatsperiteration);
-                    } else {
-                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                    }
-
-                    string cGpu, pGpu;
-                    cGpu = gpuIDToString(currentGpu);
-                    pGpu = gpuIDToString(peerGpu);
-
-                    resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                           std::string(pGpu),
-                                       sizeStr, "GB/sec", speed);
-                    resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                           std::string(pGpu),
-                                       sizeStr, "ms", t);
-
-                    if (p_onesize) {
-                        break;
-                    }
+                for (int j = 0; j < p_beatsperiteration; j++) {
+                  hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
+                                 hipMemcpyDeviceToDevice, stream[0]);
+                  hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
+                                 hipMemcpyDeviceToDevice, stream[1]);
                }
+
+                hipEventRecord(stop, 0);
+
+                hipEventSynchronize(stop);
+
+                float t = 0;
+                hipEventElapsedTime(&t, start, stop);
+                // times[sizeIndex] = t;
+
+                // Convert to GB/sec
+                if (p_verbose) {
+                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                }
+
+                double speed =
+                    (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
+                    t;
+                char sizeStr[256];
+                if (p_beatsperiteration > 1) {
+                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                          p_beatsperiteration);
+                } else {
+                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                }
+
+                string cGpu, pGpu;
+                cGpu = gpuIDToString(currentGpu);
+                pGpu = gpuIDToString(peerGpu);
+
+                resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                       std::string(pGpu),
+                                   sizeStr, "GB/sec", speed);
+                resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                       std::string(pGpu),
+                                   sizeStr, "ms", t);
+
+              }
+              if (p_onesize) {
+                break;
+              }
            }

            if (p_onesize) {
@@ -0,0 +1,31 @@
+project(hipCommander)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(hipCommander hipCommander.cpp)
+
+# Generate code object
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../nullkernel.hip.cpp -o nullkernel.hsaco
+  COMMENT "codeobj generated"
+)
+
+add_dependencies(hipCommander codeobj)
+
+# Link with HIP
+target_link_libraries(hipCommander hip::host)
+set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
@@ -0,0 +1,35 @@
+project(hipDispatchLatency)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(hipDispatchLatency hipDispatchLatency.cpp)
+add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)
+
+# Generate code object
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../test_kernel.cpp -o test_kernel.code
+  COMMENT "codeobj generated"
+)
+
+add_dependencies(hipDispatchLatency codeobj)
+add_dependencies(hipDispatchEnqueueRateMT codeobj)
+
+# Link with HIP
+target_link_libraries(hipDispatchLatency hip::host)
+target_link_libraries(hipDispatchEnqueueRateMT hip::host)
+set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
+set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
@@ -0,0 +1,20 @@
+project(hipInfo)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(hipInfo hipInfo.cpp)
+
+# Link with HIP
+target_link_libraries(hipInfo hip::host)
@@ -0,0 +1,20 @@
+project(MatrixTranspose)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(MatrixTranspose MatrixTranspose.cpp)
+
+# Link with HIP
+target_link_libraries(MatrixTranspose hip::host)
@@ -0,0 +1,20 @@
+project(inline_asm)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(inline_asm inline_asm.cpp)
+
+# Link with HIP
+target_link_libraries(inline_asm hip::host)
@@ -0,0 +1,30 @@
+project(texture2dDrv)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(texture2dDrv texture2dDrv.cpp)
+
+# Generate code object
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../tex2dKernel.cpp -o tex2dKernel.code
+  COMMENT "codeobj generated"
+)
+
+add_dependencies(texture2dDrv codeobj)
+
+# Link with HIP
+target_link_libraries(texture2dDrv hip::host)
@@ -0,0 +1,20 @@
+project(occupancy)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(occupancy occupancy.cpp)
+
+# Link with HIP
+target_link_libraries(occupancy hip::host)
@@ -0,0 +1,20 @@
+project(hipEvent)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(hipEvent hipEvent.cpp)
+
+# Link with HIP
+target_link_libraries(hipEvent hip::host)
@@ -0,0 +1,20 @@
+project(sharedMemory)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(sharedMemory sharedMemory.cpp)
+
+# Link with HIP
+target_link_libraries(sharedMemory hip::host)
@@ -0,0 +1,20 @@
+project(shfl)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_BUILD_TYPE Release)
+
+# Create the excutable
+add_executable(shfl shfl.cpp)
+
+# Link with HIP
+target_link_libraries(shfl hip::host)
@@ -0,0 +1,19 @@
+project(2dshfl)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(2dshfl 2dshfl.cpp)
+
+# Link with HIP
+target_link_libraries(2dshfl hip::host)
@@ -0,0 +1,19 @@
+project(dynamic_shared)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(dynamic_shared dynamic_shared.cpp)
+
+# Link with HIP
+target_link_libraries(dynamic_shared hip::host)
@@ -0,0 +1,19 @@
+project(stream)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(stream stream.cpp)
+
+# Link with HIP
+target_link_libraries(stream hip::host)
@@ -0,0 +1,19 @@
+project(peer2peer)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(peer2peer peer2peer.cpp)
+
+# Link with HIP
+target_link_libraries(peer2peer hip::host)
@@ -0,0 +1,19 @@
+project(unroll)
+
+cmake_minimum_required(VERSION 3.10)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip
+find_package(hip)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the excutable
+add_executable(unroll unroll.cpp)
+
+# Link with HIP
+target_link_libraries(unroll hip::host)
@@ -0,0 +1,27 @@
+Build procedure
+
+We provide Makefile and CMakeLists.txt to build the samples seperately.
+
+1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
+
+To build a sample, just type in sample folder,
+
+make
+
+
+
+2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
+
+To build a sample, type in sample folder,
+
+mkdir build (if build folder is missing)
+
+cd build
+
+cmake ..
+
+make
+
+If you want debug version, follow,
+
+cmake -DCMAKE_BUILD_TYPE=Debug ..