From 6ef3a48e27769959cdc5365c5c88dcd0110f18b6 Mon Sep 17 00:00:00 2001
From: Sourabh Betigeri <sourabh.betigeri@amd.com>
Date: Sat, 12 Sep 2020 01:00:17 -0700
Subject: [PATCH] Added performance test for Device concurrency

Change-Id: I30f4e65467613152ce86b80b13e1c9f2928da801
---
 perftests/stream/hipPerfDeviceConcurrency.cpp | 289 ++++++++++++++++++
 1 file changed, 289 insertions(+)
 create mode 100644 perftests/stream/hipPerfDeviceConcurrency.cpp
diff --git a/perftests/stream/hipPerfDeviceConcurrency.cpp b/perftests/stream/hipPerfDeviceConcurrency.cpp
new file mode 100644
index 0000000000..7d6699a9a2
--- /dev/null
+++ b/perftests/stream/hipPerfDeviceConcurrency.cpp
@@ -0,0 +1,289 @@
+/*
+ Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},         // All black
+};
+
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
+                            float yStep, uint maxIter) {
+
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+class hipPerfDeviceConcurrency {
+  public:
+  hipPerfDeviceConcurrency();
+  ~hipPerfDeviceConcurrency();
+
+  void setNumGpus(unsigned int num) {
+    numDevices = num;
+  }
+  unsigned int getNumGpus() {
+    return numDevices;
+  }
+
+  void open(void);
+  void close(void);
+  void run(unsigned int testCase, int numGpus);
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int numDevices;
+  unsigned int width_;
+  unsigned int bufSize;
+  unsigned int coordIdx;
+  unsigned long long totalIters = 0;
+};
+
+
+hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
+
+hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
+
+void hipPerfDeviceConcurrency::open(void) {
+
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  setNumGpus(nGpu);
+  if (nGpu < 1) {
+  std::cout << "info: didn't find any GPU! skipping the test!\n";
+  passed();
+  }
+
+
+}
+
+
+void hipPerfDeviceConcurrency::close() {
+}
+
+void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
+
+
+  static int deviceId;
+  uint * hPtr[numGpus];
+  uint * dPtr[numGpus];
+  hipStream_t streams[numGpus];
+  int numCUs[numGpus];
+  unsigned int maxIter[numGpus];
+  unsigned long long expectedIters[numGpus];
+
+  int threads, threads_per_block, blocks;
+  float xStep, yStep, xPos, yPos;
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipDeviceProp_t props = {0};
+  HIPCHECK(hipGetDeviceProperties(&props, i));
+
+  if (testCase != 0) {
+  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
+            << i << std::endl;
+  }
+
+  numCUs[i] = props.multiProcessorCount;
+  int clkFrequency = 0;
+  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
+
+  clkFrequency =(unsigned int)clkFrequency/1000;
+
+  // Maximum iteration count
+  // maxIter = 8388608 * (engine_clock / 1000).serial execution
+  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
+  maxIter[i] = (maxIter[i] + 15) & ~15;
+
+  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
+  width_ = 256;
+
+  bufSize = width_ * width_ * sizeof(uint);
+
+  // Create streams for concurrency
+  HIPCHECK(hipStreamCreate(&streams[i]));
+
+  // Allocate memory on the host and device
+  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+  setData(hPtr[i], 0xdeadbeef);
+  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
+
+  // Prepare kernel launch parameters
+  threads = (bufSize/sizeof(uint));
+  threads_per_block  = 64;
+  blocks = (threads/threads_per_block) + (threads % threads_per_block);
+
+  coordIdx = testCase % numCoords;
+  xStep = (float)(coords[coordIdx].width / (double)width_);
+  yStep = (float)(-coords[coordIdx].width / (double)width_);
+  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Copy memory from host to device
+  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+
+  }
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
+                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
+
+  }
+
+  for(int i = 0; i < numGpus; i++) {
+    HIPCHECK(hipStreamSynchronize(0));
+  }
+
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  for(int i = 0; i < numGpus; i++) {
+
+  if(testCase != 0) {
+    deviceId = i;
+  }
+  HIPCHECK(hipSetDevice(deviceId));
+
+  // Copy data back from device to the host
+  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
+
+  checkData(hPtr[i]);
+  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
+
+  if (testCase != 0) {
+    checkData(hPtr[i]);
+    if(totalIters != expectedIters[i]) {
+      std::cout << "Incorrect iteration count detected" << std::endl;
+    }
+  }
+
+
+  HIPCHECK(hipStreamDestroy(streams[i]));
+
+  // Free host and device memory
+  HIPCHECK(hipFree(hPtr[i]));
+  HIPCHECK(hipFree(dPtr[i]));
+  }
+
+  if (testCase != 0) {
+  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
+            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
+  }
+
+  if(testCase == 0) {
+    deviceId++;
+  }
+
+
+}
+
+
+void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
+  unsigned int *ptr2 = (unsigned int *)ptr;
+  for (unsigned int i = 0; i < width_ * width_ ; i++) {
+      ptr2[i] = value;
+  }
+}
+
+
+void hipPerfDeviceConcurrency::checkData(uint *ptr) {
+  totalIters = 0;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    totalIters += ptr[i];
+  }
+}
+
+
+int main(int argc, char* argv[]) {
+  hipPerfDeviceConcurrency deviceConcurrency;
+
+  deviceConcurrency.open();
+
+  int nGpu = deviceConcurrency.getNumGpus();
+
+  // testCase = 0 refers to warmup kernel run
+  int testCase = 0;
+
+  for (int i = 0; i < nGpu; i++) {
+    // Warm-up kernel on all devices
+    deviceConcurrency.run(testCase, 1);
+  }
+
+  // Time for kernel on 1 device
+  deviceConcurrency.run(++testCase, 1);
+
+  // Time for kernel on all available devices
+  deviceConcurrency.run(++testCase, nGpu);
+
+  passed();
+}