Revert "Merge branch 'amd-master-next' into amd-npi-next"

This reverts commit 873a2e33d3. Reason for revert: <INSERT REASONING HERE> Change-Id: I53322718dadde2c98f96140b8e260ec7ee9ef721 [ROCm/hip-tests commit: 051c84bba3]
2020-10-05 12:26:18 -04:00
@@ -1,747 +0,0 @@
-/*
- Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-#include <hip/math_functions.h>
-#include <vector>
-#include <string>
-#include <map>
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-coordRec coords[] = {
-    {0.0, 0.0, 4.0},                                     // Whole set
-    {0.0, 0.0, 0.00001},                                 // All black
-    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-template <typename T>
-__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                  uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-template <typename T>
-__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
-    T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*(float)i);
-  float y0 = (float)(yPos + yStep*(float)j);
-
-  float x = x0;
-  float y = y0;
-
-#define FAST
-  uint iter = 0;
-  float tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  float savx = x;
-  float savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16) {
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16) {
-#endif
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-  }
-  // Handle remainder
-  if (!stay) {
-    iter = 16;
-    do {
-      x = savx;
-      y = savy;
-      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-      tmp = x;
-      x =  fma(-y,y, fma(x,x,x0));
-      y =  fma(2.0f*tmp,y,y0);
-      ccount += stay;
-      iter--;
-      savx = (stay ? x : savx);
-      savy = (stay ? y : savy);
-    } while (stay && iter);
-  }
-
-
-  out[tid] = (uint)ccount;
-
-};
-
-
-template <typename T>
-__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                   uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*i);
-  double y0 = (double)(yPos + yStep*j);
-
-  double x = x0;
-  double y = y0;
-
-  uint iter = 0;
-  double tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-  out[tid] = iter;
-};
-
-
-template <typename T>
-__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
-                  T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*(double)i);
-  double y0 = (double)(yPos + yStep*(double)j);
-
-  double x = x0;
-  double y = y0;
-
-#define FAST
-  uint iter = 0;
-  double tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  double savx = x;
-  double savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16)
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16)
-#endif
-  {
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-    }
-  // Handle remainder
-    if (!stay) {
-      iter = 16;
-      do {
-        x = savx;
-        y = savy;
-        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-        tmp = x;
-        x =  fma(-y,y, fma(x,x,x0));
-        y =  fma(2.0f*tmp,y,y0);
-        ccount += stay;
-        iter--;
-        savx = (stay ? x : savx);
-        savy = (stay ? y : savy);
-      }
-      while (stay && iter);
-
-    }
-    out[tid] = (uint)ccount;
-};
-
-static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
-
-// Expected results for each kernel run at each coord
-unsigned long long expectedIters[] = {
-    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
-    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
-    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
-    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
-
-class hipPerfMandelBrot {
-  public:
-  hipPerfMandelBrot();
-  ~hipPerfMandelBrot();
-
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
-
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
-
-  void open(int deviceID);
-  void run(unsigned int testCase, unsigned int deviceId);
-  void printResults(void);
-
-  // array of funtion pointers
-  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
-                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
-                 int threads_per_block, int kernelCnt);
-
-  // Wrappers
-  void float_mad(uint *out, uint width, float xPos,  float yPos,
-                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                  int blocks, int threads_per_block, int kernelCnt);
-
-  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                            int blocks, int threads_per_block, int kernelCnt);
-
-  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                   int threads_per_block, int kernelCnt);
-
-  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
-                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                             int threads_per_block, int kernelCnt);
-
-  hipStream_t streams[2];
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numKernels;
-  unsigned int numStreams;
-
-  std::map<std::string, std::vector<double>> results;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int maxIter;
-  unsigned int coordIdx;
-  volatile unsigned long long totalIters = 0;
-  int numCUs;
-  static const unsigned int numLoops = 10;
-};
-
-
-hipPerfMandelBrot::hipPerfMandelBrot() {}
-
-hipPerfMandelBrot::~hipPerfMandelBrot() {}
-
-void hipPerfMandelBrot::open(int deviceId) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-  std::cout << "info: didn't find any GPU! skipping the test!\n";
-  passed();
-  return;
-  }
-
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
-
-  numCUs = props.multiProcessorCount;
-}
-
-
-void hipPerfMandelBrot::printResults() {
-
-  int numkernels = getNumKernels();
-  int numStreams = getNumStreams();
-
-  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
-            << numStreams << " streams (s)" <<  std::endl;
-
-  std::map<std::string, std::vector<double>>:: iterator itr;
-  for (itr = results.begin(); itr != results.end(); itr++) {
-          std::cout << "\n" << std::setw(20) << itr->first << " ";
-          for(auto i : results[itr->first]) {
-            std::cout << std::setw(10) << i << " ";
-            }
-     }
-  results.clear();
-
-  std::cout << std::endl;
-}
-
-
-// Wrappers for the kernel launches
-void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                                   float yStep, uint maxIter, hipStream_t* streams,
-                                   int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                      streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
-                      maxIter);
-
-
-}
-
-
-void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                             int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
-                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                               int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                              int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
-
-  unsigned int numStreams = getNumStreams();
-
-  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
-               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
-
-  // Maximum iteration count
-  maxIter = 32768;
-
-  uint * hPtr[numKernels];
-  uint * dPtr[numKernels];
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_  * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  for (uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-
-  // Allocate memory on the host and device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-    setData(hPtr[i], 0xdeadbeef);
-    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-  }
-
-
-  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory asynchronously and concurrently from host to device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-  }
-
-  // Synchronize to make sure all the copies are completed
-  HIPCHECK(hipStreamSynchronize(0));
-
-  int kernelIdx;
-  if(testCase == 0 || testCase == 5 || testCase == 10) {
-    kernelIdx = 0;
-  }
-
-  else if(testCase == 1 || testCase == 6 || testCase == 11) {
-    kernelIdx = 1;
-  }
-  else if(testCase == 2 || testCase == 7 || testCase == 12) {
-    kernelIdx = 2;
-  }
-  else if(testCase == 3 || testCase == 8 || testCase == 13){
-    kernelIdx = 3;
-  }
-
-
-  double totalTime = 0.0;
-
-  for (unsigned int k = 0; k < numLoops; k++) {
-
-  coordIdx = testCase % numCoords;
-
-  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
-                  testCase == 5 || testCase == 6 || testCase == 7 ||
-                  testCase == 10 || testCase == 11 || testCase == 12)) {
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                           threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-
-  }
-
-
-  else {
-  double xStep = coords[coordIdx].width / (double)width_;
-  double yStep = -coords[coordIdx].width / (double)width_;
-  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
-  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                         threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  }
-
-
-  }
-
-  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
-  }
-
-
-  for(uint i = 0; i < numKernels; i++) {
-  checkData(hPtr[i]);
-
-  int j =0;
-  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
-          j++;
-  }
-
-  if(j==30) {
-    std::cout << "Incorrect iteration count detected. ";
-  }
-
-  }
-
-
-  // Compute GFLOPS.  There are 7 FLOPs per iteration
-  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
-                (totalTime / (double)numLoops);
-
-
-  std::vector<std::string> kernelName = {"float", "float_unroll",
-                      "double", "double_unroll"};
-
-  // Print results except for Warm-up kernel
-  if(testCase!=100) {
-  results[kernelName[testCase % 4]].push_back(perf);
- }
-
-
-  for(uint i = 0 ; i < numStreams; i++) {
-    HIPCHECK(hipStreamDestroy(streams[i]));
-  }
-
-
-  // Free host and device memory
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipFree(hPtr[i]));
-    HIPCHECK(hipFree(dPtr[i]));
-  }
-
-
-}
-
-
-void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfMandelBrot::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfMandelBrot mandelbrotCompute;
-  int deviceId = 0;
-
-  mandelbrotCompute.open(deviceId);
-
-  for (unsigned int testCase = 0; testCase < 3; testCase++) {
-
-
-  switch (testCase) {
-
-
-  case 0: {
-    // Warmup-kernel - default stream executes serially
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(100/*Random number*/, deviceId);
-    break;
-    }
-
-
-  case 1: {
-    // run all - sync
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-  }
-
-
-  case 2: {
-    // run all - async
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(2);
-    mandelbrotCompute.setNumKernels(2);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-
-  }
-
-
-  default: {
-    break;
-  }
-
-
-  }
-
-
-
-  }
-
-
-  passed();
-}
@@ -1,289 +0,0 @@
-/*
- Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
-                            float yStep, uint maxIter) {
-
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-class hipPerfDeviceConcurrency {
-  public:
-  hipPerfDeviceConcurrency();
-  ~hipPerfDeviceConcurrency();
-
-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
-
-  void open(void);
-  void close(void);
-  void run(unsigned int testCase, int numGpus);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numDevices;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int coordIdx;
-  unsigned long long totalIters = 0;
-};
-
-
-hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
-
-hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
-
-void hipPerfDeviceConcurrency::open(void) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  setNumGpus(nGpu);
-  if (nGpu < 1) {
-  std::cout << "info: didn't find any GPU! skipping the test!\n";
-  passed();
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::close() {
-}
-
-void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
-
-
-  static int deviceId;
-  uint * hPtr[numGpus];
-  uint * dPtr[numGpus];
-  hipStream_t streams[numGpus];
-  int numCUs[numGpus];
-  unsigned int maxIter[numGpus];
-  unsigned long long expectedIters[numGpus];
-
-  int threads, threads_per_block, blocks;
-  float xStep, yStep, xPos, yPos;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, i));
-
-  if (testCase != 0) {
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
-            << i << std::endl;
-  }
-
-  numCUs[i] = props.multiProcessorCount;
-  int clkFrequency = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
-  maxIter[i] = (maxIter[i] + 15) & ~15;
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  HIPCHECK(hipStreamCreate(&streams[i]));
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-  setData(hPtr[i], 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-
-  // Prepare kernel launch parameters
-  threads = (bufSize/sizeof(uint));
-  threads_per_block  = 64;
-  blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  xStep = (float)(coords[coordIdx].width / (double)width_);
-  yStep = (float)(-coords[coordIdx].width / (double)width_);
-  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory from host to device
-  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
-
-  }
-
-  for(int i = 0; i < numGpus; i++) {
-    HIPCHECK(hipStreamSynchronize(0));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-  HIPCHECK(hipSetDevice(deviceId));
-
-  // Copy data back from device to the host
-  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
-
-  checkData(hPtr[i]);
-  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
-
-  if (testCase != 0) {
-    checkData(hPtr[i]);
-    if(totalIters != expectedIters[i]) {
-      std::cout << "Incorrect iteration count detected" << std::endl;
-    }
-  }
-
-
-  HIPCHECK(hipStreamDestroy(streams[i]));
-
-  // Free host and device memory
-  HIPCHECK(hipFree(hPtr[i]));
-  HIPCHECK(hipFree(dPtr[i]));
-  }
-
-  if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
-            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
-  }
-
-  if(testCase == 0) {
-    deviceId++;
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfDeviceConcurrency deviceConcurrency;
-
-  deviceConcurrency.open();
-
-  int nGpu = deviceConcurrency.getNumGpus();
-
-  // testCase = 0 refers to warmup kernel run
-  int testCase = 0;
-
-  for (int i = 0; i < nGpu; i++) {
-    // Warm-up kernel on all devices
-    deviceConcurrency.run(testCase, 1);
-  }
-
-  // Time for kernel on 1 device
-  deviceConcurrency.run(++testCase, 1);
-
-  // Time for kernel on all available devices
-  deviceConcurrency.run(++testCase, nGpu);
-
-  passed();
-}
@@ -1,20 +0,0 @@
-project(bit_extract)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(bit_extract bit_extract.cpp)
-
-# Link with HIP
-target_link_libraries(bit_extract hip::host)
@@ -9,15 +9,19 @@ HIPCC=$(HIP_PATH)/bin/hipcc

 # Show how to use PLATFORM to specify different options for each compiler:
 ifeq (${HIP_PLATFORM}, nvcc)
-	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
+	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 
 endif

 EXE=bit_extract
+EXE_STATIC=bit_extract_static

 $(EXE): bit_extract.cpp
 	$(HIPCC) $(HIPCC_FLAGS) $< -o $@

-all: $(EXE)
+$(EXE_STATIC): bit_extract.cpp
+	$(HIPCC) -use-staticlib  $(HIPCC_FLAGS) $< -o $@
+
+all: $(EXE) $(EXE_STATIC)

 clean:
-	rm -f *.o $(EXE)
+	rm -f *.o $(EXE) $(EXE_STATIC)
@@ -1,36 +0,0 @@
-project(module_api)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(runKernel.hip.out runKernel.cpp)
-add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
-add_executable(defaultDriver.hip.out defaultDriver.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../vcpy_kernel.cpp -o vcpy_kernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(runKernel.hip.out codeobj)
-add_dependencies(launchKernelHcc.hip.out codeobj)
-add_dependencies(defaultDriver.hip.out codeobj)
-
-# Link with HIP
-target_link_libraries(runKernel.hip.out hip::host)
-target_link_libraries(launchKernelHcc.hip.out hip::host)
-target_link_libraries(defaultDriver.hip.out hip::host)
@@ -1,30 +0,0 @@
-project(modile_api_global)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(runKernel.hip.out runKernel.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../vcpy_kernel.cpp -o vcpy_kernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(runKernel.hip.out codeobj)
-
-# Link with HIP
-target_link_libraries(runKernel.hip.out hip::host)
@@ -1,21 +0,0 @@
-#Follow "README.md" to generate square.cpp if it's missing
-
-project(square)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(square square.cpp)
-
-# Link with HIP
-target_link_libraries(square hip::host)
@@ -11,7 +11,7 @@ else
 	SOURCES=square.cpp
 endif

-all: square.out
+all: square.out square.out.static

 # Step
 square.cpp: square.cu
@@ -20,5 +20,8 @@ square.cpp: square.cu
 square.out: $(SOURCES)
 	$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@

+square.out.static: $(SOURCES)
+	$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
+
 clean:
-	rm -f *.o *.out square.cpp
+	rm -f *.o *.out *.out.static square.cpp
@@ -1,39 +1,13 @@
 # Square.md

-Simple test which shows how to use hipify-perl to port CUDA code to HIP.
-See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
+Simple test which shows how to use hipify-perl to port CUDA code to HIP.  
+See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. 
 Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:

- Add hip/bin path to the PATH
+1. Add hip/bin path to the PATH  :
+    <code>export PATH=$PATH:[MYHIP]/bin</code>

-```
-$ export PATH=$PATH:[MYHIP]/bin
-```
-
- Define environment variable
-
-```
-$ export HIP_PATH=[MYHIP]
-```
-
- Build executible file
-
-```
-$ cd ~/hip/samples/0_Intro/square
-$ make
-/home/user/hip/bin/hipify-perl square.cu > square.cpp
-/home/user/hip/bin/hipcc  square.cpp -o square.out
-/home/user/hip/bin/hipcc -use-staticlib  square.cpp -o square.out.static
-```
- Execute file
-```
-$ ./square.out
-info: running on device Navi 14 [Radeon Pro W5500]
-info: allocate host mem (  7.63 MB)
-info: allocate device mem (  7.63 MB)
-info: copy Host2Device
-info: launch 'vector_square' kernel
-info: copy Device2Host
-info: check result
-PASSED!
-```
+2. <code>$ make </code>
+   Make runs these steps.  This can be performed on either CUDA or AMD platform:
+   <code>hipify-perl square.cu > square.cpp </code>    # convert cuda code to hip code
+   <code>hipcc square.cpp</code>                       # compile into executable
@@ -1,20 +0,0 @@
-project(hipBusBandwidth)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)
-
-# Link with HIP
-target_link_libraries(hipBusBandwidth hip::host)
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
 bool p_verbose = false;
 MallocMode p_malloc_mode = MallocPinned;
 int p_numa_ctl = -1;
-int p_iterations = 0;
+int p_iterations = 10;
 int p_beatsperiteration = 1;
 int p_device = 0;
 int p_detailed = 0;
@@ -89,9 +89,7 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
 int sizes[] = {-64, -256, -512, 1,    2,    4,     8,     16,    32,     64,     128,   256,
               512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
 int nSizes = sizeof(sizes) / sizeof(int);
-// iterations to be run for the corresponding sizes, less number as the size increases
-int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
-               200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};
+

 // ****************************************************************************
 // Function: RunBenchmark_H2D
@@ -176,48 +174,53 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
    hipEventCreate(&stop);
    CHECK_HIP_ERROR();

-    // store the times temporarily to estimate latency
-    // float times[nSizes];
-    for (int i = 0; i < nSizes; i++) {
-      int sizeIndex, iterIndex;
-      sizeIndex = i;
-      iterIndex = i;
+    // Three passes, forward and backward both
+    for (int pass = 0; pass < p_iterations; pass++) {
+        // store the times temporarily to estimate latency
+        // float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++) {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;

-      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-      const int nbytes = sizeToBytes(thisSize);
-      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-      for (int pass = 0; pass < niter; pass++) {
+            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+            const int nbytes = sizeToBytes(thisSize);

-        hipEventRecord(start, 0);
-        for (int j = 0; j < p_beatsperiteration; j++) {
-          memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
+            hipEventRecord(start, 0);
+            for (int j = 0; j < p_beatsperiteration; j++) {
+                memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
+            }
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
+            // times[sizeIndex] = t;
+
+            // Convert to GB/sec
+            if (p_verbose) {
+                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            }
+
+            double speed =
+                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
+            char sizeStr[256];
+            if (p_beatsperiteration > 1) {
+                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+            } else {
+                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            }
+            resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
+                               "ms", t);
+
+            if (p_onesize) {
+                break;
+            }
        }
-        hipEventRecord(stop, 0);
-        hipEventSynchronize(stop);
-        float t = 0;
-        hipEventElapsedTime(&t, start, stop);
-        // times[sizeIndex] = t;
-        // Convert to GB/sec
-        if (p_verbose) {
-          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-        }
-
-        double speed =
-            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
-        char sizeStr[256];
-        if (p_beatsperiteration > 1) {
-          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-        } else {
-          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        }
-        resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "GB/sec", speed);
-        resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,                                            "ms", t);
-
-      }
-      if (p_onesize) {
-        break;
-      }
    }

    if (p_onesize) {
@@ -344,50 +347,53 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
    hipEventCreate(&stop);
    CHECK_HIP_ERROR();

-    // store the times temporarily to estimate latency
-    // float times[nSizes];
-    for (int i = 0; i < nSizes; i++) {
-      int sizeIndex, iterIndex;
-      sizeIndex = i;
-      iterIndex = i;
+    // Three passes, forward and backward both
+    for (int pass = 0; pass < p_iterations; pass++) {
+        // store the times temporarily to estimate latency
+        // float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++) {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;

-      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-      const int nbytes = sizeToBytes(thisSize);
-      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-      for (int pass = 0; pass < niter; pass++) {
+            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+            const int nbytes = sizeToBytes(thisSize);

-        hipEventRecord(start, 0);
-        for (int j = 0; j < p_beatsperiteration; j++) {
-          memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
+            hipEventRecord(start, 0);
+            for (int j = 0; j < p_beatsperiteration; j++) {
+                memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
+            }
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
+            // times[sizeIndex] = t;
+
+            // Convert to GB/sec
+            if (p_verbose) {
+                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            }
+
+            double speed =
+                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+            char sizeStr[256];
+            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            if (p_beatsperiteration > 1) {
+                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+            } else {
+                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            }
+            resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "ms", t);
+            if (p_onesize) {
+                break;
+            }
        }
-        hipEventRecord(stop, 0);
-        hipEventSynchronize(stop);
-        float t = 0;
-        hipEventElapsedTime(&t, start, stop);
-        // times[sizeIndex] = t;
-        // Convert to GB/sec
-        if (p_verbose) {
-          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-        }
-
-        double speed =
-            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-        char sizeStr[256];
-        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        if (p_beatsperiteration > 1) {
-          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-        } else {
-          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        }
-        resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "GB/sec", speed);
-        resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "ms", t);
-
-      }
-      if (p_onesize) {
-        break;
-      }
    }

    if (p_onesize) {
@@ -516,43 +522,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
    hipStreamCreate(&stream[0]);
    hipStreamCreate(&stream[1]);

-    // store the times temporarily to estimate latency
-    // float times[nSizes];
-    for (int i = 0; i < nSizes; i++) {
-      int sizeIndex, iterIndex;
-      sizeIndex = i;
-      iterIndex = i;
+    // Three passes, forward and backward both
+    for (int pass = 0; pass < p_iterations; pass++) {
+        // store the times temporarily to estimate latency
+        // float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++) {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;

-      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-      const int nbytes = sizeToBytes(thisSize);
-      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-      for (int pass = 0; pass < niter; pass++) {
+            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+            const int nbytes = sizeToBytes(thisSize);

-        hipEventRecord(start, 0);
-        hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
-        hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
-        hipEventRecord(stop, 0);
-        hipEventSynchronize(stop);
-        float t = 0;
-        hipEventElapsedTime(&t, start, stop);
+            hipEventRecord(start, 0);
+            hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
+            hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);

-        // Convert to GB/sec
-        if (p_verbose) {
-          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            // Convert to GB/sec
+            if (p_verbose) {
+                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            }
+
+            double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
+            char sizeStr[256];
+            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            resultDB.AddResult(
+                std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
+                "GB/sec", speed);
+            resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "ms", t);
        }
-
-        double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
-        char sizeStr[256];
-        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        resultDB.AddResult(
-            std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
-            "GB/sec", speed);
-        resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "ms", t);
-      }
-      if (p_onesize) {
-        break;
-      }
    }

    // Cleanup
@@ -702,63 +708,66 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
            hipEventCreate(&stop);
            CHECK_HIP_ERROR();

-            // store the times temporarily to estimate latency
-            // float times[nSizes];
-            for (int i = 0; i < nSizes; i++) {
-              int sizeIndex, iterIndex;
-              sizeIndex = i;
-              iterIndex = i;
+            // Three passes, forward and backward both
+            for (int pass = 0; pass < p_iterations; pass++) {
+                // store the times temporarily to estimate latency
+                // float times[nSizes];
+                // Step through sizes forward on even passes and backward on odd
+                for (int i = 0; i < nSizes; i++) {
+                    int sizeIndex;
+                    if ((pass % 2) == 0)
+                        sizeIndex = i;
+                    else
+                        sizeIndex = (nSizes - 1) - i;

-              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-              const int nbytes = sizeToBytes(thisSize);
-              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-              for (int pass = 0; pass < niter; pass++) {
+                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+                    const int nbytes = sizeToBytes(thisSize);

-                hipDeviceSynchronize();
+                    hipDeviceSynchronize();

-                hipEventRecord(start, 0);
+                    hipEventRecord(start, 0);

-                for (int j = 0; j < p_beatsperiteration; j++) {
-                  hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
-                }
+                    for (int j = 0; j < p_beatsperiteration; j++) {
+                        hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
+                    }

-                hipEventRecord(stop, 0);
+                    hipEventRecord(stop, 0);

-                hipEventSynchronize(stop);
+                    hipEventSynchronize(stop);

-                float t = 0;
-                hipEventElapsedTime(&t, start, stop);
-                // times[sizeIndex] = t;
+                    float t = 0;
+                    hipEventElapsedTime(&t, start, stop);
+                    // times[sizeIndex] = t;

-                // Convert to GB/sec
-                if (p_verbose) {
-                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                }
+                    // Convert to GB/sec
+                    if (p_verbose) {
+                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                    }

-                double speed =
-                    (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-                char sizeStr[256];
-                if (p_beatsperiteration > 1) {
-                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                          p_beatsperiteration);
-                } else {
-                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                }
+                    double speed =
+                        (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+                    char sizeStr[256];
+                    if (p_beatsperiteration > 1) {
+                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                                p_beatsperiteration);
+                    } else {
+                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                    }

-                string cGpu, pGpu;
-                cGpu = gpuIDToString(currentGpu);
-                pGpu = gpuIDToString(peerGpu);
+                    string cGpu, pGpu;
+                    cGpu = gpuIDToString(currentGpu);
+                    pGpu = gpuIDToString(peerGpu);

-                resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
-                                       "_gpu" + std::string(pGpu),
+                    resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
+                                           "_gpu" + std::string(pGpu),
                                       sizeStr, "GB/sec", speed);
-                resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
-                                       "_gpu" + std::string(pGpu),
+                    resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
+                                           "_gpu" + std::string(pGpu),
                                       sizeStr, "ms", t);

-              }
-                if (p_onesize) {
-                  break;
+                    if (p_onesize) {
+                        break;
+                    }
                }
            }

@@ -820,68 +829,71 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
            hipStreamCreate(&stream[0]);
            hipStreamCreate(&stream[1]);

-            // store the times temporarily to estimate latency
-            // float times[nSizes];
-            for (int i = 0; i < nSizes; i++) {
-              int sizeIndex, iterIndex;
-              sizeIndex = i;
-              iterIndex = i;
+            // Three passes, forward and backward both
+            for (int pass = 0; pass < p_iterations; pass++) {
+                // store the times temporarily to estimate latency
+                // float times[nSizes];
+                // Step through sizes forward on even passes and backward on odd
+                for (int i = 0; i < nSizes; i++) {
+                    int sizeIndex;
+                    if ((pass % 2) == 0)
+                        sizeIndex = i;
+                    else
+                        sizeIndex = (nSizes - 1) - i;

-              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-              const int nbytes = sizeToBytes(thisSize);
-              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-              for (int pass = 0; pass < niter; pass++) {
+                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+                    const int nbytes = sizeToBytes(thisSize);

-                hipDeviceSynchronize();
+                    hipDeviceSynchronize();

-                hipEventRecord(start, 0);
+                    hipEventRecord(start, 0);

-                for (int j = 0; j < p_beatsperiteration; j++) {
-                  hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
-                                 hipMemcpyDeviceToDevice, stream[0]);
-                  hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
-                                 hipMemcpyDeviceToDevice, stream[1]);
+                    for (int j = 0; j < p_beatsperiteration; j++) {
+                        hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
+                                       hipMemcpyDeviceToDevice, stream[0]);
+                        hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
+                                       hipMemcpyDeviceToDevice, stream[1]);
+                    }
+
+                    hipEventRecord(stop, 0);
+
+                    hipEventSynchronize(stop);
+
+                    float t = 0;
+                    hipEventElapsedTime(&t, start, stop);
+                    // times[sizeIndex] = t;
+
+                    // Convert to GB/sec
+                    if (p_verbose) {
+                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                    }
+
+                    double speed =
+                        (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
+                        t;
+                    char sizeStr[256];
+                    if (p_beatsperiteration > 1) {
+                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                                p_beatsperiteration);
+                    } else {
+                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                    }
+
+                    string cGpu, pGpu;
+                    cGpu = gpuIDToString(currentGpu);
+                    pGpu = gpuIDToString(peerGpu);
+
+                    resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                           std::string(pGpu),
+                                       sizeStr, "GB/sec", speed);
+                    resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                           std::string(pGpu),
+                                       sizeStr, "ms", t);
+
+                    if (p_onesize) {
+                        break;
+                    }
                }
-
-                hipEventRecord(stop, 0);
-
-                hipEventSynchronize(stop);
-
-                float t = 0;
-                hipEventElapsedTime(&t, start, stop);
-                // times[sizeIndex] = t;
-
-                // Convert to GB/sec
-                if (p_verbose) {
-                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                }
-
-                double speed =
-                    (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
-                    t;
-                char sizeStr[256];
-                if (p_beatsperiteration > 1) {
-                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                          p_beatsperiteration);
-                } else {
-                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                }
-
-                string cGpu, pGpu;
-                cGpu = gpuIDToString(currentGpu);
-                pGpu = gpuIDToString(peerGpu);
-
-                resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                       std::string(pGpu),
-                                   sizeStr, "GB/sec", speed);
-                resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                       std::string(pGpu),
-                                   sizeStr, "ms", t);
-
-              }
-              if (p_onesize) {
-                break;
-              }
            }

            if (p_onesize) {
@@ -1,31 +0,0 @@
-project(hipCommander)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipCommander hipCommander.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../nullkernel.hip.cpp -o nullkernel.hsaco
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(hipCommander codeobj)
-
-# Link with HIP
-target_link_libraries(hipCommander hip::host)
-set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
@@ -1,35 +0,0 @@
-project(hipDispatchLatency)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipDispatchLatency hipDispatchLatency.cpp)
-add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../test_kernel.cpp -o test_kernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(hipDispatchLatency codeobj)
-add_dependencies(hipDispatchEnqueueRateMT codeobj)
-
-# Link with HIP
-target_link_libraries(hipDispatchLatency hip::host)
-target_link_libraries(hipDispatchEnqueueRateMT hip::host)
-set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
-set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
@@ -1,20 +0,0 @@
-project(hipInfo)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipInfo hipInfo.cpp)
-
-# Link with HIP
-target_link_libraries(hipInfo hip::host)
@@ -1,20 +0,0 @@
-project(MatrixTranspose)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(MatrixTranspose MatrixTranspose.cpp)
-
-# Link with HIP
-target_link_libraries(MatrixTranspose hip::host)
@@ -1,20 +0,0 @@
-project(inline_asm)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(inline_asm inline_asm.cpp)
-
-# Link with HIP
-target_link_libraries(inline_asm hip::host)
@@ -1,30 +0,0 @@
-project(texture2dDrv)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(texture2dDrv texture2dDrv.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../tex2dKernel.cpp -o tex2dKernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(texture2dDrv codeobj)
-
-# Link with HIP
-target_link_libraries(texture2dDrv hip::host)
@@ -1,20 +0,0 @@
-project(occupancy)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(occupancy occupancy.cpp)
-
-# Link with HIP
-target_link_libraries(occupancy hip::host)
@@ -1,20 +0,0 @@
-project(hipEvent)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipEvent hipEvent.cpp)
-
-# Link with HIP
-target_link_libraries(hipEvent hip::host)
@@ -1,20 +0,0 @@
-project(sharedMemory)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(sharedMemory sharedMemory.cpp)
-
-# Link with HIP
-target_link_libraries(sharedMemory hip::host)
@@ -1,20 +0,0 @@
-project(shfl)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(shfl shfl.cpp)
-
-# Link with HIP
-target_link_libraries(shfl hip::host)
@@ -1,19 +0,0 @@
-project(2dshfl)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(2dshfl 2dshfl.cpp)
-
-# Link with HIP
-target_link_libraries(2dshfl hip::host)
@@ -1,19 +0,0 @@
-project(dynamic_shared)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(dynamic_shared dynamic_shared.cpp)
-
-# Link with HIP
-target_link_libraries(dynamic_shared hip::host)
@@ -1,19 +0,0 @@
-project(stream)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(stream stream.cpp)
-
-# Link with HIP
-target_link_libraries(stream hip::host)
@@ -1,19 +0,0 @@
-project(peer2peer)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(peer2peer peer2peer.cpp)
-
-# Link with HIP
-target_link_libraries(peer2peer hip::host)
@@ -1,19 +0,0 @@
-project(unroll)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(unroll unroll.cpp)
-
-# Link with HIP
-target_link_libraries(unroll hip::host)
@@ -1,27 +0,0 @@
-Build procedure
-
-We provide Makefile and CMakeLists.txt to build the samples seperately.
-
-1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
-
-To build a sample, just type in sample folder,
-
-make
-
-
-
-2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
-
-To build a sample, type in sample folder,
-
-mkdir build (if build folder is missing)
-
-cd build
-
-cmake ..
-
-make
-
-If you want debug version, follow,
-
-cmake -DCMAKE_BUILD_TYPE=Debug ..