SWDEV-286322 - Fix EOL in files

Change-Id: I1746e80aceeeaa4fc5df65c858f4816d99010186
2021-05-19 14:48:53 -04:00
@@ -1,308 +1,308 @@
-/*
- Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-#include <vector>
-
-using namespace std;
-
-#define NUM_TYPES 3
-vector<string> types= {"float", "float2", "float4"};
-vector<unsigned int> typeSizes = {4, 8, 16};
-
-#define NUM_SIZES 12
-vector<unsigned int> sizes = {1,  2,   4,   8,   16,   32,
-                              64, 128, 256, 512, 1024, 2048};
-
-#define NUM_BUFS 6
-#define MAX_BUFS (1 << (NUM_BUFS - 1))
-
-template <typename T>
-__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
-                           T **inBuffer, int numBufs) {
-
-  uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  uint inputIdx = gid % inBufSize;
-
-  T tmp = (T)0.0f;
-  for(int i = 0; i < numBufs; i++) {
-    tmp += *(*(inBuffer+i)+inputIdx);
-  }
-
-  if (writeIt*(unsigned int)tmp.x) {
-    outBuffer[gid] = tmp;
-   }
-};
-
-template <typename T>
-__global__ void sampleRateFloat(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
-                                T ** inBuffer, int numBufs) {
-
-  uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  uint inputIdx = gid % inBufSize;
-
-  T tmp = (T)0.0f;
-
-  for(int i = 0; i < numBufs; i++) {
-    tmp += *((*inBuffer+i)+inputIdx);
-  }
-
-  if (writeIt*(unsigned int)tmp) {
-    outBuffer[gid] = tmp;
-  }
-};
-
-class hipPerfSampleRate {
-  public:
-  hipPerfSampleRate();
-  ~hipPerfSampleRate();
-
-  void open(void);
-  void run(unsigned int testCase);
-  void close(void);
-
-  // array of funtion pointers
-  typedef void (hipPerfSampleRate::*funPtr)(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  // Wrappers
-  void float_kernel(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  void float2_kernel(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  void float4_kernel(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int width_;
-  unsigned int bufSize_;
-  unsigned long long totalIters = 0;
-  int numCUs;
-
-  unsigned int outBufSize_;
-  static const unsigned int MAX_ITERATIONS = 25;
-  unsigned int numBufs_;
-  unsigned int typeIdx_;
-};
-
-
-hipPerfSampleRate::hipPerfSampleRate() {}
-
-hipPerfSampleRate::~hipPerfSampleRate() {}
-
-void hipPerfSampleRate::open(void) {
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-   if (nGpu < 1) {
-  std::cout << "info: didn't find any GPU! skipping the test!\n";
-  passed();
-  return;
-  }
-
-  int deviceId = 0;
-  hipDeviceProp_t props = {0};
-  props = {0};
-  HIPCHECK(hipSetDevice(deviceId));
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
-  numCUs = props.multiProcessorCount;
-  }
-
-
-void hipPerfSampleRate::close() {
-
-}
-
-
-// Wrappers for the kernel launches
-void hipPerfSampleRate::float_kernel(void * outBuffer, unsigned int inBufSize,
-                                       unsigned int writeIt, void **inBuffer,
-                                       int numBufs, int grids, int blocks, int threads_per_block) {
-
-  hipLaunchKernelGGL(sampleRateFloat<float>, dim3(grids, grids, grids), dim3 (blocks), 0, 0,
-                      (float*)outBuffer, inBufSize, writeIt, (float**)inBuffer, numBufs);
-
-}
-
-void hipPerfSampleRate::float2_kernel(void * outBuffer, unsigned int inBufSize,
-                                       unsigned int writeIt, void **inBuffer,
-                                       int grids, int blocks, int threads_per_block, int numBufs) {
-
-  hipLaunchKernelGGL(sampleRate<float2>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
-                      (float2 *)outBuffer, inBufSize, writeIt, (float2**)inBuffer, numBufs);
-}
-
-void hipPerfSampleRate::float4_kernel(void * outBuffer, unsigned int inBufSize,
-                                       unsigned int writeIt, void **inBuffer,
-                                       int grids, int blocks, int threads_per_block, int numBufs) {
-
-  hipLaunchKernelGGL(sampleRate<float4>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
-                      (float4 *) outBuffer, inBufSize, writeIt, (float4**)inBuffer, numBufs);
-}
-
-void hipPerfSampleRate::run(unsigned int test) {
-
-  funPtr p[] = {&hipPerfSampleRate::float_kernel, &hipPerfSampleRate::float2_kernel,
-               &hipPerfSampleRate::float4_kernel};
-
-  // We compute a square domain
-  width_ = sizes[test % NUM_SIZES];
-  typeIdx_ = (test / NUM_SIZES) % NUM_TYPES;
-  bufSize_ = width_ * width_ * typeSizes[typeIdx_];
-  numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES)));
-
-  void *  hOutPtr;
-  void *  dOutPtr;
-  void *  hInPtr[numBufs_];
-  void ** dPtr;
-  void *  dInPtr[numBufs_];
-
- outBufSize_ =
-      sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hOutPtr, outBufSize_, hipHostMallocDefault));
-  setData((void *)hOutPtr, 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dOutPtr, outBufSize_));
-
-  // Allocate 2D array in Device
-   hipMalloc((void **)&dPtr, numBufs_* sizeof(void *));
-
-  for (uint i = 0; i < numBufs_; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hInPtr[i], bufSize_, hipHostMallocDefault));
-    HIPCHECK(hipMalloc((uint **)&dInPtr[i], bufSize_));
-    setData(hInPtr[i], 0x3f800000);
-  }
-
-  // Populate array of pointers with array addresses
-  hipMemcpy(dPtr, dInPtr, numBufs_* sizeof(void *), hipMemcpyHostToDevice);
-
-  // Copy memory from host to device
-  for (uint i = 0; i < numBufs_; i++) {
-  HIPCHECK(hipMemcpy(dInPtr[i], hInPtr[i], bufSize_, hipMemcpyHostToDevice));
-  }
-
-  HIPCHECK(hipMemcpy(dOutPtr, hOutPtr, outBufSize_, hipMemcpyHostToDevice));
-
-  // Prepare kernel launch parameters
-  // outBufSize_/sizeof(uint) - Grid size in 3D
-  int grids = 64;
-  int blocks = 64;
-  int threads_per_block  = 1;
-
-  unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_);
-  unsigned int sizeDW = width_ * width_;
-  unsigned int writeIt = 0;
-
-  int idx = 0;
-
-  if (!types[typeIdx_].compare("float")) {
-    idx = 0;
-  }
-  else if(!types[typeIdx_].compare("float2")) {
-          idx = 1;
-  }
-  else if(!types[typeIdx_].compare("float4")) {
-          idx = 2;
-  }
-
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-  for (uint i = 0; i < maxIter; i++) {
-        (this->*p[idx]) ((void *)dOutPtr, sizeDW, writeIt, dPtr, numBufs_, grids, blocks,
-                          threads_per_block);
-  }
-
-  hipDeviceSynchronize();
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  double perf = ((double)outBufSize_ * numBufs_ * (double)maxIter * (double)(1e-09)) /
-                          all_kernel_time.count();
-
-  cout << "Domain " << sizes[NUM_SIZES - 1] << "x"<< sizes[NUM_SIZES - 1] << " bufs "
-       << numBufs_ << " " << types[typeIdx_] << " " << width_<<"x"<<width_<< " (GB/s) "
-       << perf << endl;
-
-   HIPCHECK(hipFree(dOutPtr));
-
-   // Free host and device memory
-   for (uint i = 0; i < numBufs_; i++) {
-    HIPCHECK(hipFree(hInPtr[i]));
-    HIPCHECK(hipFree(dInPtr[i]));
-    }
-
-   HIPCHECK(hipFree(hOutPtr));
-   HIPCHECK(hipFree(dPtr));
-}
-
-
-void hipPerfSampleRate::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfSampleRate::checkData(uint *ptr) {
-  for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) {
-    if (ptr[i] != (float)numBufs_) {
-      cout << "Data validation failed at "<< i << " Got "<< ptr[i] << ", expected "
-           << (float)numBufs_;
-      break;
-          }
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfSampleRate sampleTypes;
-
-  sampleTypes.open();
-
-  for (unsigned int testCase = 0; testCase < 216 ; testCase+=36) {
-    sampleTypes.run(testCase);
-  }
-
-
-  passed();
-}
+/*
+ Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+#include <hip/hip_vector_types.h>
+#include <vector>
+
+using namespace std;
+
+#define NUM_TYPES 3
+vector<string> types= {"float", "float2", "float4"};
+vector<unsigned int> typeSizes = {4, 8, 16};
+
+#define NUM_SIZES 12
+vector<unsigned int> sizes = {1,  2,   4,   8,   16,   32,
+                              64, 128, 256, 512, 1024, 2048};
+
+#define NUM_BUFS 6
+#define MAX_BUFS (1 << (NUM_BUFS - 1))
+
+template <typename T>
+__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
+                           T **inBuffer, int numBufs) {
+
+  uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  uint inputIdx = gid % inBufSize;
+
+  T tmp = (T)0.0f;
+  for(int i = 0; i < numBufs; i++) {
+    tmp += *(*(inBuffer+i)+inputIdx);
+  }
+
+  if (writeIt*(unsigned int)tmp.x) {
+    outBuffer[gid] = tmp;
+   }
+};
+
+template <typename T>
+__global__ void sampleRateFloat(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
+                                T ** inBuffer, int numBufs) {
+
+  uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  uint inputIdx = gid % inBufSize;
+
+  T tmp = (T)0.0f;
+
+  for(int i = 0; i < numBufs; i++) {
+    tmp += *((*inBuffer+i)+inputIdx);
+  }
+
+  if (writeIt*(unsigned int)tmp) {
+    outBuffer[gid] = tmp;
+  }
+};
+
+class hipPerfSampleRate {
+  public:
+  hipPerfSampleRate();
+  ~hipPerfSampleRate();
+
+  void open(void);
+  void run(unsigned int testCase);
+  void close(void);
+
+  // array of funtion pointers
+  typedef void (hipPerfSampleRate::*funPtr)(void * outBuffer, unsigned int
+    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
+    int threads_per_block);
+
+  // Wrappers
+  void float_kernel(void * outBuffer, unsigned int
+    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
+    int threads_per_block);
+
+  void float2_kernel(void * outBuffer, unsigned int
+    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
+    int threads_per_block);
+
+  void float4_kernel(void * outBuffer, unsigned int
+    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
+    int threads_per_block);
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int width_;
+  unsigned int bufSize_;
+  unsigned long long totalIters = 0;
+  int numCUs;
+
+  unsigned int outBufSize_;
+  static const unsigned int MAX_ITERATIONS = 25;
+  unsigned int numBufs_;
+  unsigned int typeIdx_;
+};
+
+
+hipPerfSampleRate::hipPerfSampleRate() {}
+
+hipPerfSampleRate::~hipPerfSampleRate() {}
+
+void hipPerfSampleRate::open(void) {
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+   if (nGpu < 1) {
+  std::cout << "info: didn't find any GPU! skipping the test!\n";
+  passed();
+  return;
+  }
+
+  int deviceId = 0;
+  hipDeviceProp_t props = {0};
+  props = {0};
+  HIPCHECK(hipSetDevice(deviceId));
+  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
+  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
+    << std::endl;
+  numCUs = props.multiProcessorCount;
+  }
+
+
+void hipPerfSampleRate::close() {
+
+}
+
+
+// Wrappers for the kernel launches
+void hipPerfSampleRate::float_kernel(void * outBuffer, unsigned int inBufSize,
+                                       unsigned int writeIt, void **inBuffer,
+                                       int numBufs, int grids, int blocks, int threads_per_block) {
+
+  hipLaunchKernelGGL(sampleRateFloat<float>, dim3(grids, grids, grids), dim3 (blocks), 0, 0,
+                      (float*)outBuffer, inBufSize, writeIt, (float**)inBuffer, numBufs);
+
+}
+
+void hipPerfSampleRate::float2_kernel(void * outBuffer, unsigned int inBufSize,
+                                       unsigned int writeIt, void **inBuffer,
+                                       int grids, int blocks, int threads_per_block, int numBufs) {
+
+  hipLaunchKernelGGL(sampleRate<float2>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
+                      (float2 *)outBuffer, inBufSize, writeIt, (float2**)inBuffer, numBufs);
+}
+
+void hipPerfSampleRate::float4_kernel(void * outBuffer, unsigned int inBufSize,
+                                       unsigned int writeIt, void **inBuffer,
+                                       int grids, int blocks, int threads_per_block, int numBufs) {
+
+  hipLaunchKernelGGL(sampleRate<float4>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
+                      (float4 *) outBuffer, inBufSize, writeIt, (float4**)inBuffer, numBufs);
+}
+
+void hipPerfSampleRate::run(unsigned int test) {
+
+  funPtr p[] = {&hipPerfSampleRate::float_kernel, &hipPerfSampleRate::float2_kernel,
+               &hipPerfSampleRate::float4_kernel};
+
+  // We compute a square domain
+  width_ = sizes[test % NUM_SIZES];
+  typeIdx_ = (test / NUM_SIZES) % NUM_TYPES;
+  bufSize_ = width_ * width_ * typeSizes[typeIdx_];
+  numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES)));
+
+  void *  hOutPtr;
+  void *  dOutPtr;
+  void *  hInPtr[numBufs_];
+  void ** dPtr;
+  void *  dInPtr[numBufs_];
+
+ outBufSize_ =
+      sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
+
+  // Allocate memory on the host and device
+  HIPCHECK(hipHostMalloc((void **)&hOutPtr, outBufSize_, hipHostMallocDefault));
+  setData((void *)hOutPtr, 0xdeadbeef);
+  HIPCHECK(hipMalloc((uint **)&dOutPtr, outBufSize_));
+
+  // Allocate 2D array in Device
+   hipMalloc((void **)&dPtr, numBufs_* sizeof(void *));
+
+  for (uint i = 0; i < numBufs_; i++) {
+    HIPCHECK(hipHostMalloc((void **)&hInPtr[i], bufSize_, hipHostMallocDefault));
+    HIPCHECK(hipMalloc((uint **)&dInPtr[i], bufSize_));
+    setData(hInPtr[i], 0x3f800000);
+  }
+
+  // Populate array of pointers with array addresses
+  hipMemcpy(dPtr, dInPtr, numBufs_* sizeof(void *), hipMemcpyHostToDevice);
+
+  // Copy memory from host to device
+  for (uint i = 0; i < numBufs_; i++) {
+  HIPCHECK(hipMemcpy(dInPtr[i], hInPtr[i], bufSize_, hipMemcpyHostToDevice));
+  }
+
+  HIPCHECK(hipMemcpy(dOutPtr, hOutPtr, outBufSize_, hipMemcpyHostToDevice));
+
+  // Prepare kernel launch parameters
+  // outBufSize_/sizeof(uint) - Grid size in 3D
+  int grids = 64;
+  int blocks = 64;
+  int threads_per_block  = 1;
+
+  unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_);
+  unsigned int sizeDW = width_ * width_;
+  unsigned int writeIt = 0;
+
+  int idx = 0;
+
+  if (!types[typeIdx_].compare("float")) {
+    idx = 0;
+  }
+  else if(!types[typeIdx_].compare("float2")) {
+          idx = 1;
+  }
+  else if(!types[typeIdx_].compare("float4")) {
+          idx = 2;
+  }
+
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+  for (uint i = 0; i < maxIter; i++) {
+        (this->*p[idx]) ((void *)dOutPtr, sizeDW, writeIt, dPtr, numBufs_, grids, blocks,
+                          threads_per_block);
+  }
+
+  hipDeviceSynchronize();
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  double perf = ((double)outBufSize_ * numBufs_ * (double)maxIter * (double)(1e-09)) /
+                          all_kernel_time.count();
+
+  cout << "Domain " << sizes[NUM_SIZES - 1] << "x"<< sizes[NUM_SIZES - 1] << " bufs "
+       << numBufs_ << " " << types[typeIdx_] << " " << width_<<"x"<<width_<< " (GB/s) "
+       << perf << endl;
+
+   HIPCHECK(hipFree(dOutPtr));
+
+   // Free host and device memory
+   for (uint i = 0; i < numBufs_; i++) {
+    HIPCHECK(hipFree(hInPtr[i]));
+    HIPCHECK(hipFree(dInPtr[i]));
+    }
+
+   HIPCHECK(hipFree(hOutPtr));
+   HIPCHECK(hipFree(dPtr));
+}
+
+
+void hipPerfSampleRate::setData(void *ptr, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) {
+      ptr2[i] = value;
+  }
+}
+
+
+void hipPerfSampleRate::checkData(uint *ptr) {
+  for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) {
+    if (ptr[i] != (float)numBufs_) {
+      cout << "Data validation failed at "<< i << " Got "<< ptr[i] << ", expected "
+           << (float)numBufs_;
+      break;
+          }
+  }
+}
+
+
+int main(int argc, char* argv[]) {
+  hipPerfSampleRate sampleTypes;
+
+  sampleTypes.open();
+
+  for (unsigned int testCase = 0; testCase < 216 ; testCase+=36) {
+    sampleTypes.run(testCase);
+  }
+
+
+  passed();
+}
@@ -1,289 +1,289 @@
-/*
- Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
-                            float yStep, uint maxIter) {
-
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-class hipPerfDeviceConcurrency {
-  public:
-  hipPerfDeviceConcurrency();
-  ~hipPerfDeviceConcurrency();
-
-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
-
-  void open(void);
-  void close(void);
-  void run(unsigned int testCase, int numGpus);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numDevices;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int coordIdx;
-  unsigned long long totalIters = 0;
-};
-
-
-hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
-
-hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
-
-void hipPerfDeviceConcurrency::open(void) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  setNumGpus(nGpu);
-  if (nGpu < 1) {
-  std::cout << "info: didn't find any GPU! skipping the test!\n";
-  passed();
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::close() {
-}
-
-void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
-
-
-  static int deviceId;
-  uint * hPtr[numGpus];
-  uint * dPtr[numGpus];
-  hipStream_t streams[numGpus];
-  int numCUs[numGpus];
-  unsigned int maxIter[numGpus];
-  unsigned long long expectedIters[numGpus];
-
-  int threads, threads_per_block, blocks;
-  float xStep, yStep, xPos, yPos;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, i));
-
-  if (testCase != 0) {
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
-            << i << std::endl;
-  }
-
-  numCUs[i] = props.multiProcessorCount;
-  int clkFrequency = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
-  maxIter[i] = (maxIter[i] + 15) & ~15;
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  HIPCHECK(hipStreamCreate(&streams[i]));
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-  setData(hPtr[i], 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-
-  // Prepare kernel launch parameters
-  threads = (bufSize/sizeof(uint));
-  threads_per_block  = 64;
-  blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  xStep = (float)(coords[coordIdx].width / (double)width_);
-  yStep = (float)(-coords[coordIdx].width / (double)width_);
-  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory from host to device
-  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
-
-  }
-
-  for(int i = 0; i < numGpus; i++) {
-    HIPCHECK(hipStreamSynchronize(0));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-  HIPCHECK(hipSetDevice(deviceId));
-
-  // Copy data back from device to the host
-  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
-
-  checkData(hPtr[i]);
-  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
-
-  if (testCase != 0) {
-    checkData(hPtr[i]);
-    if(totalIters != expectedIters[i]) {
-      std::cout << "Incorrect iteration count detected" << std::endl;
-    }
-  }
-
-
-  HIPCHECK(hipStreamDestroy(streams[i]));
-
-  // Free host and device memory
-  HIPCHECK(hipFree(hPtr[i]));
-  HIPCHECK(hipFree(dPtr[i]));
-  }
-
-  if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
-            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
-  }
-
-  if(testCase == 0) {
-    deviceId++;
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfDeviceConcurrency deviceConcurrency;
-
-  deviceConcurrency.open();
-
-  int nGpu = deviceConcurrency.getNumGpus();
-
-  // testCase = 0 refers to warmup kernel run
-  int testCase = 0;
-
-  for (int i = 0; i < nGpu; i++) {
-    // Warm-up kernel on all devices
-    deviceConcurrency.run(testCase, 1);
-  }
-
-  // Time for kernel on 1 device
-  deviceConcurrency.run(++testCase, 1);
-
-  // Time for kernel on all available devices
-  deviceConcurrency.run(++testCase, nGpu);
-
-  passed();
-}
+/*
+ Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},         // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
+                            float yStep, uint maxIter) {
+
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+class hipPerfDeviceConcurrency {
+  public:
+  hipPerfDeviceConcurrency();
+  ~hipPerfDeviceConcurrency();
+
+  void setNumGpus(unsigned int num) {
+    numDevices = num;
+  }
+  unsigned int getNumGpus() {
+    return numDevices;
+  }
+
+  void open(void);
+  void close(void);
+  void run(unsigned int testCase, int numGpus);
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int numDevices;
+  unsigned int width_;
+  unsigned int bufSize;
+  unsigned int coordIdx;
+  unsigned long long totalIters = 0;
+};
+
+
+hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
+
+hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
+
+void hipPerfDeviceConcurrency::open(void) {
+
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  setNumGpus(nGpu);
+  if (nGpu < 1) {
+  std::cout << "info: didn't find any GPU! skipping the test!\n";
+  passed();
+  }
+
+
+}
+
+
+void hipPerfDeviceConcurrency::close() {
+}
+
+void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
+
+
+  static int deviceId;
+  uint * hPtr[numGpus];
+  uint * dPtr[numGpus];
+  hipStream_t streams[numGpus];
+  int numCUs[numGpus];
+  unsigned int maxIter[numGpus];
+  unsigned long long expectedIters[numGpus];
+
+  int threads, threads_per_block, blocks;
+  float xStep, yStep, xPos, yPos;
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipDeviceProp_t props = {0};
+  HIPCHECK(hipGetDeviceProperties(&props, i));
+
+  if (testCase != 0) {
+  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
+            << i << std::endl;
+  }
+
+  numCUs[i] = props.multiProcessorCount;
+  int clkFrequency = 0;
+  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
+
+  clkFrequency =(unsigned int)clkFrequency/1000;
+
+  // Maximum iteration count
+  // maxIter = 8388608 * (engine_clock / 1000).serial execution
+  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
+  maxIter[i] = (maxIter[i] + 15) & ~15;
+
+  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
+  width_ = 256;
+
+  bufSize = width_ * width_ * sizeof(uint);
+
+  // Create streams for concurrency
+  HIPCHECK(hipStreamCreate(&streams[i]));
+
+  // Allocate memory on the host and device
+  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+  setData(hPtr[i], 0xdeadbeef);
+  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
+
+  // Prepare kernel launch parameters
+  threads = (bufSize/sizeof(uint));
+  threads_per_block  = 64;
+  blocks = (threads/threads_per_block) + (threads % threads_per_block);
+
+  coordIdx = testCase % numCoords;
+  xStep = (float)(coords[coordIdx].width / (double)width_);
+  yStep = (float)(-coords[coordIdx].width / (double)width_);
+  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Copy memory from host to device
+  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+
+  }
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
+                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
+
+  }
+
+  for(int i = 0; i < numGpus; i++) {
+    HIPCHECK(hipStreamSynchronize(0));
+  }
+
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+  HIPCHECK(hipSetDevice(deviceId));
+
+  // Copy data back from device to the host
+  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
+
+  checkData(hPtr[i]);
+  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
+
+  if (testCase != 0) {
+    checkData(hPtr[i]);
+    if(totalIters != expectedIters[i]) {
+      std::cout << "Incorrect iteration count detected" << std::endl;
+    }
+  }
+
+
+  HIPCHECK(hipStreamDestroy(streams[i]));
+
+  // Free host and device memory
+  HIPCHECK(hipFree(hPtr[i]));
+  HIPCHECK(hipFree(dPtr[i]));
+  }
+
+  if (testCase != 0) {
+  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
+            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
+  }
+
+  if(testCase == 0) {
+    deviceId++;
+  }
+
+
+}
+
+
+void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < width_ * width_ ; i++) {
+      ptr2[i] = value;
+  }
+}
+
+
+void hipPerfDeviceConcurrency::checkData(uint *ptr) {
+  totalIters = 0;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    totalIters += ptr[i];
+  }
+}
+
+
+int main(int argc, char* argv[]) {
+  hipPerfDeviceConcurrency deviceConcurrency;
+
+  deviceConcurrency.open();
+
+  int nGpu = deviceConcurrency.getNumGpus();
+
+  // testCase = 0 refers to warmup kernel run
+  int testCase = 0;
+
+  for (int i = 0; i < nGpu; i++) {
+    // Warm-up kernel on all devices
+    deviceConcurrency.run(testCase, 1);
+  }
+
+  // Time for kernel on 1 device
+  deviceConcurrency.run(++testCase, 1);
+
+  // Time for kernel on all available devices
+  deviceConcurrency.run(++testCase, nGpu);
+
+  passed();
+}
@@ -1,157 +1,157 @@
-#pragma once
-
-#ifdef __unix__
-
-#include <string>
-#include <atomic>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-
-template <typename T>
-struct Shmem {
-  std::atomic<T> handle_;
-  std::atomic<int> done_counter_;
-};
-
-template <typename T>
-struct ShmemMeta {
-  std::string shmem_name_;
-  int shmem_fd_;
-  Shmem<T>* shmem_;
-};
-
-template <typename T>
-class MultiProcess {
-public:
-  MultiProcess(size_t num_proc) : num_proc_(num_proc) {}
-  ~MultiProcess();
-
-  void DebugInfo(pid_t pid);
-
-  pid_t SpawnProcess(bool debug_bkpt);
-  bool CreateShmem();
-
-  bool WriteHandleToShmem(T ipc_handle);
-  bool WaitTillAllChildReads();
-
-  bool ReadHandleFromShmem(T& ipc_handle);
-  bool NotifyParentDone();
-
-private:
-  const size_t num_proc_;
-  bool debug_proc_;
-  ShmemMeta<T> shmem_meta_obj_;
-};
-
-// Template Implementations
-template <typename T>
-MultiProcess<T>::~MultiProcess() {
-  if(munmap(shmem_meta_obj_.shmem_, sizeof(Shmem<T>)) < 0) {
-    std::cout<<"Error Unmapping shared memory "<<std::endl;
-    exit(0);
-  }
-}
-
-template <typename T>
-void MultiProcess<T>::DebugInfo(pid_t pid) {
-  const int delay = 1;
-
-  if (pid == 0) {
-    std::cout<<" Child Process with ID: "<<getpid()<<std::endl;
-  } else {
-    std::cout<<" Parent Process with ID: "<<getpid()<<std::endl;
-  }
-
-  volatile int flag = 0;
-  while (!flag) {
-    sleep(delay);
-  }
-}
-
-template <typename T>
-pid_t MultiProcess<T>::SpawnProcess(bool debug_bkpt) {
-  if (num_proc_ < 0) {
-    std::cout<<"Num Process cannot be less than 1"<<std::endl;
-    return -1;
-  }
-
-  pid_t pid;
-  for (size_t proc_idx = 0; proc_idx < num_proc_; ++proc_idx) {
-    pid = fork();
-    if (pid < 0) {
-      std::cout<<"Fork Failed"<<std::endl;
-      assert(false);
-    } else if (pid == 0) {
-      //Child Process, so break
-      break;
-    }
-  }
-
-  if (debug_bkpt) {
-    DebugInfo(pid);
-  }
-
-  return pid;
-}
-
-template <typename T>
-bool MultiProcess<T>::CreateShmem() {
-  if (num_proc_ < 0) {
-    std::cout<<"Num Process cannot be less than 1"<<std::endl;
-    return false;
-  }
-
-  char name_template[] = "/tmp/eventXXXXX";
-  int temp_fd = mkstemp(name_template);
-  shmem_meta_obj_.shmem_name_ = name_template;
-  shmem_meta_obj_.shmem_name_.replace(0, 5, "/hip_");
-  shmem_meta_obj_.shmem_fd_ = shm_open(shmem_meta_obj_.shmem_name_.c_str(),
-                                       O_RDWR | O_CREAT, 0777);
-
-  if (ftruncate(shmem_meta_obj_.shmem_fd_, sizeof(ShmemMeta<T>)) != 0) {
-    std::cout<<"Cannot FTruncate "<<std::endl;
-    exit(0);
-  }
-
-  shmem_meta_obj_.shmem_ = (Shmem<T>*)mmap(0, sizeof(Shmem<T>), PROT_READ | PROT_WRITE,
-                                           MAP_SHARED, shmem_meta_obj_.shmem_fd_, 0);
-  memset(&shmem_meta_obj_.shmem_->handle_, 0x00, sizeof(T));
-  shmem_meta_obj_.shmem_->done_counter_ = -1;
-
-  return true;
-}
-
-template <typename T>
-bool MultiProcess<T>::WriteHandleToShmem(T ipc_handle) {
-  memcpy(&shmem_meta_obj_.shmem_->handle_, &ipc_handle, sizeof(T));
-  shmem_meta_obj_.shmem_->done_counter_ = 0;
-  return true;
-}
-
-template <typename T>
-bool MultiProcess<T>::WaitTillAllChildReads() {
-  size_t write_count = 0;
-  while (shmem_meta_obj_.shmem_->done_counter_ != num_proc_) {
-    ++write_count;
-  }
-  return true;
-}
-
-template <typename T>
-bool MultiProcess<T>::ReadHandleFromShmem(T& ipc_handle) {
-  size_t read_count = 0;
-  while (shmem_meta_obj_.shmem_->done_counter_ == -1) {
-    ++read_count;
-  }
-  memcpy(&ipc_handle, &shmem_meta_obj_.shmem_->handle_, sizeof(T));
-  return true;
-}
-
-template <typename T>
-bool MultiProcess<T>::NotifyParentDone() {
-  ++shmem_meta_obj_.shmem_->done_counter_;
-  return true;
-}
-
-#endif /* __unix__ */
+#pragma once
+
+#ifdef __unix__
+
+#include <string>
+#include <atomic>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+template <typename T>
+struct Shmem {
+  std::atomic<T> handle_;
+  std::atomic<int> done_counter_;
+};
+
+template <typename T>
+struct ShmemMeta {
+  std::string shmem_name_;
+  int shmem_fd_;
+  Shmem<T>* shmem_;
+};
+
+template <typename T>
+class MultiProcess {
+public:
+  MultiProcess(size_t num_proc) : num_proc_(num_proc) {}
+  ~MultiProcess();
+
+  void DebugInfo(pid_t pid);
+
+  pid_t SpawnProcess(bool debug_bkpt);
+  bool CreateShmem();
+
+  bool WriteHandleToShmem(T ipc_handle);
+  bool WaitTillAllChildReads();
+
+  bool ReadHandleFromShmem(T& ipc_handle);
+  bool NotifyParentDone();
+
+private:
+  const size_t num_proc_;
+  bool debug_proc_;
+  ShmemMeta<T> shmem_meta_obj_;
+};
+
+// Template Implementations
+template <typename T>
+MultiProcess<T>::~MultiProcess() {
+  if(munmap(shmem_meta_obj_.shmem_, sizeof(Shmem<T>)) < 0) {
+    std::cout<<"Error Unmapping shared memory "<<std::endl;
+    exit(0);
+  }
+}
+
+template <typename T>
+void MultiProcess<T>::DebugInfo(pid_t pid) {
+  const int delay = 1;
+
+  if (pid == 0) {
+    std::cout<<" Child Process with ID: "<<getpid()<<std::endl;
+  } else {
+    std::cout<<" Parent Process with ID: "<<getpid()<<std::endl;
+  }
+
+  volatile int flag = 0;
+  while (!flag) {
+    sleep(delay);
+  }
+}
+
+template <typename T>
+pid_t MultiProcess<T>::SpawnProcess(bool debug_bkpt) {
+  if (num_proc_ < 0) {
+    std::cout<<"Num Process cannot be less than 1"<<std::endl;
+    return -1;
+  }
+
+  pid_t pid;
+  for (size_t proc_idx = 0; proc_idx < num_proc_; ++proc_idx) {
+    pid = fork();
+    if (pid < 0) {
+      std::cout<<"Fork Failed"<<std::endl;
+      assert(false);
+    } else if (pid == 0) {
+      //Child Process, so break
+      break;
+    }
+  }
+
+  if (debug_bkpt) {
+    DebugInfo(pid);
+  }
+
+  return pid;
+}
+
+template <typename T>
+bool MultiProcess<T>::CreateShmem() {
+  if (num_proc_ < 0) {
+    std::cout<<"Num Process cannot be less than 1"<<std::endl;
+    return false;
+  }
+
+  char name_template[] = "/tmp/eventXXXXX";
+  int temp_fd = mkstemp(name_template);
+  shmem_meta_obj_.shmem_name_ = name_template;
+  shmem_meta_obj_.shmem_name_.replace(0, 5, "/hip_");
+  shmem_meta_obj_.shmem_fd_ = shm_open(shmem_meta_obj_.shmem_name_.c_str(),
+                                       O_RDWR | O_CREAT, 0777);
+
+  if (ftruncate(shmem_meta_obj_.shmem_fd_, sizeof(ShmemMeta<T>)) != 0) {
+    std::cout<<"Cannot FTruncate "<<std::endl;
+    exit(0);
+  }
+
+  shmem_meta_obj_.shmem_ = (Shmem<T>*)mmap(0, sizeof(Shmem<T>), PROT_READ | PROT_WRITE,
+                                           MAP_SHARED, shmem_meta_obj_.shmem_fd_, 0);
+  memset(&shmem_meta_obj_.shmem_->handle_, 0x00, sizeof(T));
+  shmem_meta_obj_.shmem_->done_counter_ = -1;
+
+  return true;
+}
+
+template <typename T>
+bool MultiProcess<T>::WriteHandleToShmem(T ipc_handle) {
+  memcpy(&shmem_meta_obj_.shmem_->handle_, &ipc_handle, sizeof(T));
+  shmem_meta_obj_.shmem_->done_counter_ = 0;
+  return true;
+}
+
+template <typename T>
+bool MultiProcess<T>::WaitTillAllChildReads() {
+  size_t write_count = 0;
+  while (shmem_meta_obj_.shmem_->done_counter_ != num_proc_) {
+    ++write_count;
+  }
+  return true;
+}
+
+template <typename T>
+bool MultiProcess<T>::ReadHandleFromShmem(T& ipc_handle) {
+  size_t read_count = 0;
+  while (shmem_meta_obj_.shmem_->done_counter_ == -1) {
+    ++read_count;
+  }
+  memcpy(&ipc_handle, &shmem_meta_obj_.shmem_->handle_, sizeof(T));
+  return true;
+}
+
+template <typename T>
+bool MultiProcess<T>::NotifyParentDone() {
+  ++shmem_meta_obj_.shmem_->done_counter_;
+  return true;
+}
+
+#endif /* __unix__ */
@@ -1,126 +1,126 @@
-/*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include "MultiProcess.h"
-
-void multi_process(int num_process, bool debug_process) {
-
-#ifdef __unix__
-
-  float *A_h, *B_h, *C_h;
-  float *A_d, *B_d, *C_d;
-  hipEvent_t start, stop;
-  size_t Nbytes = N * sizeof(float);
-
-  MultiProcess<hipIpcEventHandle_t>* mProcess = new MultiProcess<hipIpcEventHandle_t>(num_process);
-  mProcess->CreateShmem();
-  pid_t pid = mProcess->SpawnProcess(debug_process);
-
-  // Parent Process
-  if (pid != 0) {
-
-    unsigned blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
-    if (blocks > 1024) blocks = 1024;
-    if (blocks == 0) blocks = 1;
-
-    printf("N=%zu (A+B+C= %6.1f MB total) blocks=%u threadsPerBlock=%u iterations=%d\n", N,
-           ((double)3 * N * sizeof(float)) / 1024 / 1024, blocks, threadsPerBlock, iterations);
-    printf("iterations=%d\n", iterations);
-
-    HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
-
-    // NULL stream check:
-    HIPCHECK(hipEventCreateWithFlags(&start, hipEventDisableTiming|hipEventInterprocess));
-    HIPCHECK(hipEventCreateWithFlags(&stop, hipEventDisableTiming|hipEventInterprocess));
-
-
-    HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-    HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-
-
-    for (int i = 0; i < iterations; i++) {
-      //--- START TIMED REGION
-      long long hostStart = HipTest::get_time();
-      // Record the start event
-      HIPCHECK(hipEventRecord(start, NULL));
-
-      hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
-                        static_cast<const float*>(A_d), static_cast<const float*>(B_d), C_d, N);
-
-
-      HIPCHECK(hipEventRecord(stop, NULL));
-      HIPCHECK(hipEventSynchronize(stop));
-      HIPCHECK(hipEventQuery(stop));
-      long long hostStop = HipTest::get_time();
-      //--- STOP TIMED REGION
-
-      float eventMs = 1.0f;
-      // should fail
-      HIPASSERT(hipSuccess != hipEventElapsedTime(&eventMs, start, stop));
-      float hostMs = HipTest::elapsed_time(hostStart, hostStop);
-
-      printf("host_time (gettimeofday)          =%6.3fms\n", hostMs);
-      printf("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs);
-      printf("\n");
-
-    }
-
-    hipIpcEventHandle_t ipc_handle;
-    HIPCHECK(hipIpcGetEventHandle(&ipc_handle, start));
-
-    mProcess->WriteHandleToShmem(ipc_handle);
-    mProcess->WaitTillAllChildReads();
-
-  } else {
-    hipEvent_t ipc_event;
-    hipIpcEventHandle_t ipc_handle;
-    mProcess->ReadHandleFromShmem(ipc_handle);
-    HIPCHECK(hipIpcOpenEventHandle(&ipc_event, ipc_handle));
-
-    HIPCHECK(hipEventSynchronize(ipc_event));
-    HIPCHECK(hipEventDestroy(ipc_event));
-    mProcess->NotifyParentDone();
-  }
-
-  if (pid != 0) {
-    HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-    printf("check:\n");
-    HipTest::checkVectorADD(A_h, B_h, C_h, N, true);
-
-    HIPCHECK(hipEventDestroy(start));
-    HIPCHECK(hipEventDestroy(stop));
-    delete mProcess;
-  }
-
-#endif /* __unix__ */
-
-}
-
-int main(int argc, char* argv[]) {
-  HipTest::parseStandardArguments(argc, argv, true);
-  multi_process((N < 64) ? N : 64, debug_test);
-  passed();
-}
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#include "MultiProcess.h"
+
+void multi_process(int num_process, bool debug_process) {
+
+#ifdef __unix__
+
+  float *A_h, *B_h, *C_h;
+  float *A_d, *B_d, *C_d;
+  hipEvent_t start, stop;
+  size_t Nbytes = N * sizeof(float);
+
+  MultiProcess<hipIpcEventHandle_t>* mProcess = new MultiProcess<hipIpcEventHandle_t>(num_process);
+  mProcess->CreateShmem();
+  pid_t pid = mProcess->SpawnProcess(debug_process);
+
+  // Parent Process
+  if (pid != 0) {
+
+    unsigned blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
+    if (blocks > 1024) blocks = 1024;
+    if (blocks == 0) blocks = 1;
+
+    printf("N=%zu (A+B+C= %6.1f MB total) blocks=%u threadsPerBlock=%u iterations=%d\n", N,
+           ((double)3 * N * sizeof(float)) / 1024 / 1024, blocks, threadsPerBlock, iterations);
+    printf("iterations=%d\n", iterations);
+
+    HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
+
+    // NULL stream check:
+    HIPCHECK(hipEventCreateWithFlags(&start, hipEventDisableTiming|hipEventInterprocess));
+    HIPCHECK(hipEventCreateWithFlags(&stop, hipEventDisableTiming|hipEventInterprocess));
+
+
+    HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+    HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+
+
+    for (int i = 0; i < iterations; i++) {
+      //--- START TIMED REGION
+      long long hostStart = HipTest::get_time();
+      // Record the start event
+      HIPCHECK(hipEventRecord(start, NULL));
+
+      hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+                        static_cast<const float*>(A_d), static_cast<const float*>(B_d), C_d, N);
+
+
+      HIPCHECK(hipEventRecord(stop, NULL));
+      HIPCHECK(hipEventSynchronize(stop));
+      HIPCHECK(hipEventQuery(stop));
+      long long hostStop = HipTest::get_time();
+      //--- STOP TIMED REGION
+
+      float eventMs = 1.0f;
+      // should fail
+      HIPASSERT(hipSuccess != hipEventElapsedTime(&eventMs, start, stop));
+      float hostMs = HipTest::elapsed_time(hostStart, hostStop);
+
+      printf("host_time (gettimeofday)          =%6.3fms\n", hostMs);
+      printf("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs);
+      printf("\n");
+
+    }
+
+    hipIpcEventHandle_t ipc_handle;
+    HIPCHECK(hipIpcGetEventHandle(&ipc_handle, start));
+
+    mProcess->WriteHandleToShmem(ipc_handle);
+    mProcess->WaitTillAllChildReads();
+
+  } else {
+    hipEvent_t ipc_event;
+    hipIpcEventHandle_t ipc_handle;
+    mProcess->ReadHandleFromShmem(ipc_handle);
+    HIPCHECK(hipIpcOpenEventHandle(&ipc_event, ipc_handle));
+
+    HIPCHECK(hipEventSynchronize(ipc_event));
+    HIPCHECK(hipEventDestroy(ipc_event));
+    mProcess->NotifyParentDone();
+  }
+
+  if (pid != 0) {
+    HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+    printf("check:\n");
+    HipTest::checkVectorADD(A_h, B_h, C_h, N, true);
+
+    HIPCHECK(hipEventDestroy(start));
+    HIPCHECK(hipEventDestroy(stop));
+    delete mProcess;
+  }
+
+#endif /* __unix__ */
+
+}
+
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+  multi_process((N < 64) ? N : 64, debug_test);
+  passed();
+}
@@ -1,103 +1,103 @@
-/*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include "MultiProcess.h"
-
-#define NUM_ELEMS 1024
-#define OFFSET 128
-
-void multi_process(int num_process, bool debug_process) {
-
-#ifdef __unix__
-
-  int* ipc_dptr = nullptr;
-  int* ipc_hptr = nullptr;
-  int* ipc_out_dptr = nullptr;
-  int* ipc_out_hptr = nullptr;
-  int* ipc_offset_dptr = nullptr;
-
-  MultiProcess<hipIpcMemHandle_t>* mProcess = new MultiProcess<hipIpcMemHandle_t>(num_process);
-  mProcess->CreateShmem();
-  pid_t pid = mProcess->SpawnProcess(debug_process);
-
-  // Parent Process
-  if (pid != 0) {
-    hipIpcMemHandle_t ipc_handle;
-    memset(&ipc_handle, 0x00, sizeof(hipIpcMemHandle_t));
-
-    HIPCHECK(hipMalloc((void**)&ipc_dptr, NUM_ELEMS * sizeof(int)));
-    // Add offset to the dev_ptr
-    ipc_offset_dptr = ipc_dptr + OFFSET;
-    // Get handle for the offsetted device_ptr
-    HIPCHECK(hipIpcGetMemHandle(&ipc_handle, ipc_offset_dptr));
-
-    ipc_hptr = new int[NUM_ELEMS];
-    for (size_t idx = 0; idx < NUM_ELEMS; ++idx) {
-      ipc_hptr[idx] = idx;
-    }
-
-    HIPCHECK(hipMemset(ipc_dptr, 0x00, (NUM_ELEMS * sizeof(int))));
-    HIPCHECK(hipMemcpy(ipc_dptr, ipc_hptr, (NUM_ELEMS * sizeof(int)), hipMemcpyHostToDevice));
-
-    mProcess->WriteHandleToShmem(ipc_handle);
-
-    mProcess->WaitTillAllChildReads();
-
-  } else {
-    ipc_out_hptr = new int[NUM_ELEMS];
-    memset(ipc_out_hptr, 0x00, (NUM_ELEMS * sizeof(int)));
-
-    hipIpcMemHandle_t ipc_handle;
-    mProcess->ReadHandleFromShmem(ipc_handle);
-    // Open handle to get dev_ptr
-    HIPCHECK(hipIpcOpenMemHandle((void**)&ipc_out_dptr, ipc_handle, hipIpcMemLazyEnablePeerAccess));
-
-    HIPCHECK(hipMemcpy(ipc_out_hptr, ipc_out_dptr, (NUM_ELEMS * sizeof(int)),
-                       hipMemcpyDeviceToHost));
-    for (size_t idx = 0; idx < NUM_ELEMS; ++idx) {
-      if (ipc_out_hptr[idx] != idx) {
-        std::cout<<"Failing @ idx: "<< idx << std::endl;
-      }
-    }
-    mProcess->NotifyParentDone();
-    HIPCHECK(hipIpcCloseMemHandle(ipc_out_dptr));
-    delete[] ipc_out_hptr;
-  }
-
-  if (pid != 0) {
-    delete mProcess;
-  }
-
-#endif /* __unix__ */
-
-}
-
-
-int main(int argc, char* argv[]) {
-  HipTest::parseStandardArguments(argc, argv, true);
-  multi_process((N < 64) ? N : 64, debug_test);
-  passed();
-}
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#include "MultiProcess.h"
+
+#define NUM_ELEMS 1024
+#define OFFSET 128
+
+void multi_process(int num_process, bool debug_process) {
+
+#ifdef __unix__
+
+  int* ipc_dptr = nullptr;
+  int* ipc_hptr = nullptr;
+  int* ipc_out_dptr = nullptr;
+  int* ipc_out_hptr = nullptr;
+  int* ipc_offset_dptr = nullptr;
+
+  MultiProcess<hipIpcMemHandle_t>* mProcess = new MultiProcess<hipIpcMemHandle_t>(num_process);
+  mProcess->CreateShmem();
+  pid_t pid = mProcess->SpawnProcess(debug_process);
+
+  // Parent Process
+  if (pid != 0) {
+    hipIpcMemHandle_t ipc_handle;
+    memset(&ipc_handle, 0x00, sizeof(hipIpcMemHandle_t));
+
+    HIPCHECK(hipMalloc((void**)&ipc_dptr, NUM_ELEMS * sizeof(int)));
+    // Add offset to the dev_ptr
+    ipc_offset_dptr = ipc_dptr + OFFSET;
+    // Get handle for the offsetted device_ptr
+    HIPCHECK(hipIpcGetMemHandle(&ipc_handle, ipc_offset_dptr));
+
+    ipc_hptr = new int[NUM_ELEMS];
+    for (size_t idx = 0; idx < NUM_ELEMS; ++idx) {
+      ipc_hptr[idx] = idx;
+    }
+
+    HIPCHECK(hipMemset(ipc_dptr, 0x00, (NUM_ELEMS * sizeof(int))));
+    HIPCHECK(hipMemcpy(ipc_dptr, ipc_hptr, (NUM_ELEMS * sizeof(int)), hipMemcpyHostToDevice));
+
+    mProcess->WriteHandleToShmem(ipc_handle);
+
+    mProcess->WaitTillAllChildReads();
+
+  } else {
+    ipc_out_hptr = new int[NUM_ELEMS];
+    memset(ipc_out_hptr, 0x00, (NUM_ELEMS * sizeof(int)));
+
+    hipIpcMemHandle_t ipc_handle;
+    mProcess->ReadHandleFromShmem(ipc_handle);
+    // Open handle to get dev_ptr
+    HIPCHECK(hipIpcOpenMemHandle((void**)&ipc_out_dptr, ipc_handle, hipIpcMemLazyEnablePeerAccess));
+
+    HIPCHECK(hipMemcpy(ipc_out_hptr, ipc_out_dptr, (NUM_ELEMS * sizeof(int)),
+                       hipMemcpyDeviceToHost));
+    for (size_t idx = 0; idx < NUM_ELEMS; ++idx) {
+      if (ipc_out_hptr[idx] != idx) {
+        std::cout<<"Failing @ idx: "<< idx << std::endl;
+      }
+    }
+    mProcess->NotifyParentDone();
+    HIPCHECK(hipIpcCloseMemHandle(ipc_out_dptr));
+    delete[] ipc_out_hptr;
+  }
+
+  if (pid != 0) {
+    delete mProcess;
+  }
+
+#endif /* __unix__ */
+
+}
+
+
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+  multi_process((N < 64) ? N : 64, debug_test);
+  passed();
+}
@@ -1,49 +1,49 @@
-/*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <stdio.h>
-#include "hip/hip_runtime.h"
-#include "test_common.h"
-
-int main(void) {
-    hipDeviceProp_t prop;
-    int dev;
-
-    hipGetDevice(&dev);
-    printf("ID of current HIP device:  %d\n", dev);
-
-    memset(&prop, 0, sizeof(hipDeviceProp_t));
-    prop.major = 1;
-    prop.minor = 3;
-    hipChooseDevice(&dev, &prop);
-    printf("ID of hip device closest to revision 1.3:  %d\n", dev);
-
-    hipSetDevice(dev);
-
-    passed();
-}
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <stdio.h>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+int main(void) {
+    hipDeviceProp_t prop;
+    int dev;
+
+    hipGetDevice(&dev);
+    printf("ID of current HIP device:  %d\n", dev);
+
+    memset(&prop, 0, sizeof(hipDeviceProp_t));
+    prop.major = 1;
+    prop.minor = 3;
+    hipChooseDevice(&dev, &prop);
+    printf("ID of hip device closest to revision 1.3:  %d\n", dev);
+
+    hipSetDevice(dev);
+
+    passed();
+}
@@ -1,161 +1,161 @@
-/* HIT_START
- * BUILD: %t %s ../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <vector>
-
-#include <hip/hip_runtime.h>
-#include "test_common.h"
-
-// Height Width Vector
-std::vector<unsigned int> hw_vector = {2048, 1024, 512, 256, 64};
-std::vector<unsigned int> mip_vector = {8, 4, 2, 1};
-
-__global__ void tex2DKernel(float* outputData, hipTextureObject_t textureObject, int width,
-                            int height, float level) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int y = blockIdx.y * blockDim.y + threadIdx.y;
-  outputData[y * width + x] = tex2DLod<float>(textureObject, x, y, level);
-}
-
-bool runMipMapTest(unsigned int width, unsigned int height, unsigned int mipmap_level) {
-  bool testResult = true;
-
-  printf("Width: %u Height: %u mip: %u \n", width, height, mipmap_level);
-
-  // Create new width & height to be tested
-  unsigned int orig_width = width;
-  unsigned int orig_height = height;
-  width /= pow(2, mipmap_level);
-  height /= pow(2, mipmap_level);
-  unsigned int size = width * height * sizeof(float);
-
-
-  float* hData = (float*)malloc(size);
-  memset(hData, 0, size);
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      hData[i * width + j] = i * width + j;
-    }
-  }
-  printf("hData: ");
-  for (int i = 0; i < 64; i++) {
-    printf("%f  ", hData[i]);
-    if (i % width == 0) {
-      printf("\n");
-    }
-  }
-  printf("\n");
-
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat);
-  HIP_ARRAY3D_DESCRIPTOR mipmapped_array_desc;
-  memset(&mipmapped_array_desc, 0x00, sizeof(HIP_ARRAY3D_DESCRIPTOR));
-  mipmapped_array_desc.Width = orig_width;
-  mipmapped_array_desc.Height = orig_height;
-  mipmapped_array_desc.Depth = 0;
-  mipmapped_array_desc.Format = HIP_AD_FORMAT_FLOAT;
-  mipmapped_array_desc.NumChannels = ((channelDesc.x != 0) + (channelDesc.y != 0)
-                                      + (channelDesc.z != 0) + (channelDesc.w != 0));
-  mipmapped_array_desc.Flags = 0;
-
-
-  hipMipmappedArray* mip_array_ptr;
-  hipMipmappedArrayCreate(&mip_array_ptr, &mipmapped_array_desc, 2 * mipmap_level);
-
-  hipArray *hipArray = nullptr;
-  HIPCHECK(hipMipmappedArrayGetLevel(&hipArray, mip_array_ptr, mipmap_level));
-  HIPCHECK(hipMemcpyToArray(hipArray, 0, 0, hData, size, hipMemcpyHostToDevice));
-
-  hipResourceDesc resDesc;
-  memset(&resDesc, 0, sizeof(resDesc));
-  resDesc.resType = hipResourceTypeArray;
-  resDesc.res.array.array = hipArray;
-
-  // Specify texture object parameters
-  hipTextureDesc texDesc;
-  memset(&texDesc, 0, sizeof(texDesc));
-  texDesc.addressMode[0] = hipAddressModeWrap;
-  texDesc.addressMode[1] = hipAddressModeWrap;
-  texDesc.filterMode = hipFilterModePoint;
-  texDesc.readMode = hipReadModeElementType;
-  texDesc.normalizedCoords = 0;
-
-  // Create texture object
-  hipTextureObject_t textureObject = 0;
-  hipCreateTextureObject(&textureObject, &resDesc, &texDesc, NULL);
-
-  float* dData = NULL;
-  hipMalloc((void**)&dData, size);
-
-  dim3 dimBlock(16, 16, 1);
-  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
-
-  hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, textureObject,
-                     width, height, (2 * mipmap_level));
-
-  hipDeviceSynchronize();
-
-  float* hOutputData = (float*)malloc(size);
-  memset(hOutputData, 0, size);
-  hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
-
-  printf("dData: ");
-  for (int i = 0; i < 64; i++) {
-    printf("%f  ", hOutputData[i]);
-    if (i % width == 0) {
-      printf("\n");
-    }
-  }
-  printf("\n");
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (hData[i * width + j] != hOutputData[i * width + j]) {
-        printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j],
-                hOutputData[i * width + j]);
-        testResult = false;
-        break;
-      }
-    }
-  }
-  hipDestroyTextureObject(textureObject);
-  hipFree(dData);
-  hipFreeArray(hipArray);
-  return testResult;
-}
-
-
-bool runTest(int argc, char** argv) {
-  bool testResult = true;
-
-  for (auto& hw: hw_vector) {
-    for (auto& mip: mip_vector) {
-      if ((hw / static_cast<int>(pow (2,(mip * 2)))) > 0) {
-        testResult |= runMipMapTest(hw, hw, mip);
-      }
-    }
-  }
-
-  printf("\n");
-  return testResult;
-}
-
-int main(int argc, char** argv) {
-  bool testResult = true;
-
-#ifdef _WIN32
-  testResult = runTest(argc, argv);
-#else
-  std::cout<<"Mipmaps are Supported only on windows, skipping the test"<<std::endl;
-#endif
-
-  if (testResult) {
-      passed();
-  } else {
-      exit(EXIT_FAILURE);
-  }
-}
-
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <vector>
+
+#include <hip/hip_runtime.h>
+#include "test_common.h"
+
+// Height Width Vector
+std::vector<unsigned int> hw_vector = {2048, 1024, 512, 256, 64};
+std::vector<unsigned int> mip_vector = {8, 4, 2, 1};
+
+__global__ void tex2DKernel(float* outputData, hipTextureObject_t textureObject, int width,
+                            int height, float level) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  int y = blockIdx.y * blockDim.y + threadIdx.y;
+  outputData[y * width + x] = tex2DLod<float>(textureObject, x, y, level);
+}
+
+bool runMipMapTest(unsigned int width, unsigned int height, unsigned int mipmap_level) {
+  bool testResult = true;
+
+  printf("Width: %u Height: %u mip: %u \n", width, height, mipmap_level);
+
+  // Create new width & height to be tested
+  unsigned int orig_width = width;
+  unsigned int orig_height = height;
+  width /= pow(2, mipmap_level);
+  height /= pow(2, mipmap_level);
+  unsigned int size = width * height * sizeof(float);
+
+
+  float* hData = (float*)malloc(size);
+  memset(hData, 0, size);
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      hData[i * width + j] = i * width + j;
+    }
+  }
+  printf("hData: ");
+  for (int i = 0; i < 64; i++) {
+    printf("%f  ", hData[i]);
+    if (i % width == 0) {
+      printf("\n");
+    }
+  }
+  printf("\n");
+
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat);
+  HIP_ARRAY3D_DESCRIPTOR mipmapped_array_desc;
+  memset(&mipmapped_array_desc, 0x00, sizeof(HIP_ARRAY3D_DESCRIPTOR));
+  mipmapped_array_desc.Width = orig_width;
+  mipmapped_array_desc.Height = orig_height;
+  mipmapped_array_desc.Depth = 0;
+  mipmapped_array_desc.Format = HIP_AD_FORMAT_FLOAT;
+  mipmapped_array_desc.NumChannels = ((channelDesc.x != 0) + (channelDesc.y != 0)
+                                      + (channelDesc.z != 0) + (channelDesc.w != 0));
+  mipmapped_array_desc.Flags = 0;
+
+
+  hipMipmappedArray* mip_array_ptr;
+  hipMipmappedArrayCreate(&mip_array_ptr, &mipmapped_array_desc, 2 * mipmap_level);
+
+  hipArray *hipArray = nullptr;
+  HIPCHECK(hipMipmappedArrayGetLevel(&hipArray, mip_array_ptr, mipmap_level));
+  HIPCHECK(hipMemcpyToArray(hipArray, 0, 0, hData, size, hipMemcpyHostToDevice));
+
+  hipResourceDesc resDesc;
+  memset(&resDesc, 0, sizeof(resDesc));
+  resDesc.resType = hipResourceTypeArray;
+  resDesc.res.array.array = hipArray;
+
+  // Specify texture object parameters
+  hipTextureDesc texDesc;
+  memset(&texDesc, 0, sizeof(texDesc));
+  texDesc.addressMode[0] = hipAddressModeWrap;
+  texDesc.addressMode[1] = hipAddressModeWrap;
+  texDesc.filterMode = hipFilterModePoint;
+  texDesc.readMode = hipReadModeElementType;
+  texDesc.normalizedCoords = 0;
+
+  // Create texture object
+  hipTextureObject_t textureObject = 0;
+  hipCreateTextureObject(&textureObject, &resDesc, &texDesc, NULL);
+
+  float* dData = NULL;
+  hipMalloc((void**)&dData, size);
+
+  dim3 dimBlock(16, 16, 1);
+  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
+
+  hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, textureObject,
+                     width, height, (2 * mipmap_level));
+
+  hipDeviceSynchronize();
+
+  float* hOutputData = (float*)malloc(size);
+  memset(hOutputData, 0, size);
+  hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
+
+  printf("dData: ");
+  for (int i = 0; i < 64; i++) {
+    printf("%f  ", hOutputData[i]);
+    if (i % width == 0) {
+      printf("\n");
+    }
+  }
+  printf("\n");
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      if (hData[i * width + j] != hOutputData[i * width + j]) {
+        printf("Difference [ %d %d ]:%f ----%f\n", i, j, hData[i * width + j],
+                hOutputData[i * width + j]);
+        testResult = false;
+        break;
+      }
+    }
+  }
+  hipDestroyTextureObject(textureObject);
+  hipFree(dData);
+  hipFreeArray(hipArray);
+  return testResult;
+}
+
+
+bool runTest(int argc, char** argv) {
+  bool testResult = true;
+
+  for (auto& hw: hw_vector) {
+    for (auto& mip: mip_vector) {
+      if ((hw / static_cast<int>(pow (2,(mip * 2)))) > 0) {
+        testResult |= runMipMapTest(hw, hw, mip);
+      }
+    }
+  }
+
+  printf("\n");
+  return testResult;
+}
+
+int main(int argc, char** argv) {
+  bool testResult = true;
+
+#ifdef _WIN32
+  testResult = runTest(argc, argv);
+#else
+  std::cout<<"Mipmaps are Supported only on windows, skipping the test"<<std::endl;
+#endif
+
+  if (testResult) {
+      passed();
+  } else {
+      exit(EXIT_FAILURE);
+  }
+}
+