From 69ae390cc3ff8a3c01dfbeddb88980567bfdd525 Mon Sep 17 00:00:00 2001
From: Aryan Salmanpour <aryan.salmanpour@amd.com>
Date: Wed, 29 Jul 2020 21:26:10 -0400
Subject: [PATCH] [HipPerf] add two subtests for measuring maximum device
 memory read/write speed

SWDEV-245290 / SWDEV-246220 [HIPPerf] Port OCLPerfDevMemWriteSpeed/OCLPerfDevMemReadSpeed into hip performance subtests

Change-Id: I5dc323c75cebbc17596dcb4ed9492e18c5246868


[ROCm/hip-tests commit: 87558f64482bb685437630119146124b5cea1080]
---
 .../memory/hipPerfDevMemReadSpeed.cpp         | 165 ++++++++++++++++++
 .../memory/hipPerfDevMemWriteSpeed.cpp        | 155 ++++++++++++++++
 2 files changed, 320 insertions(+)
 create mode 100644 projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp
 create mode 100644 projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp
diff --git a/projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp b/projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp
new file mode 100644
index 0000000000..f740d50ace
--- /dev/null
+++ b/projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp
@@ -0,0 +1,165 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+using namespace std;
+
+#define arraySize 16
+
+typedef struct d_uint16 {
+  uint data[arraySize];
+} d_uint16;
+
+__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) {
+
+  size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x ;
+
+  uint tmp = 0;
+  for (size_t i = idx; i < N; i += stride) {
+    for (size_t j = 0; j < arraySize; j++) {
+      tmp += src[i].data[j];
+    }
+  }
+
+  atomicAdd(dst, tmp);
+}
+
+int main(int argc, char* argv[]) {
+  d_uint16 *dSrc;
+  d_uint16 *hSrc;
+  uint *dDst;
+  uint *hDst;
+  hipStream_t stream;
+  ulong N = 4 * 1024 * 1024;
+  uint nBytes = N * sizeof(d_uint16);
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+      cout << "info: didn't find any GPU! skipping the test!\n";
+      passed();
+      return 0;
+  }
+
+  static int device = 0;
+  HIPCHECK(hipSetDevice(device));
+  hipDeviceProp_t props;
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name <<
+      " with " << props.multiProcessorCount << " CUs" << endl;
+
+  const unsigned threadsPerBlock = 64;
+  const unsigned blocks = props.multiProcessorCount * 4;
+
+  uint inputData = 0x1;
+  int nIter = 1000;
+
+  hSrc =  new d_uint16[nBytes];
+  HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
+  hDst =  new uint;
+  hDst[0] = 0;
+  HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
+  for (size_t i = 0; i < N; i++) {
+    for (int j = 0; j < arraySize; j++) {
+      hSrc[i].data[j] = inputData;
+    }
+  }
+
+  HIPCHECK(hipMalloc(&dSrc, nBytes));
+  HIPCHECK(hipMalloc(&dDst, sizeof(uint)));
+
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice));
+  HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice));
+
+  cout << "info: warm up launch for 'read_kernel' on the stream " << stream << endl;
+
+  hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
+  HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost));
+  hipDeviceSynchronize();
+
+  if (hDst[0] != (nBytes / sizeof(uint))) {
+    cout << "info: Data validation failed for warm up run!" << endl;
+    cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl;
+    HIPCHECK(hipErrorUnknown);
+  }
+
+  cout << "info: data validated for warm up launch for 'read_kernel'" << endl;
+  cout << "info: launching 'read_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl;
+
+  // measure performance based on host time
+  auto all_start = chrono::steady_clock::now();
+
+  for(int i = 0; i < nIter; i++) {
+    hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
+  }
+  hipDeviceSynchronize();
+
+  auto all_end = chrono::steady_clock::now();
+  chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count();
+
+  cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " <<
+      nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl;
+
+  // measure performance based on events time
+  hipEvent_t start, stop;
+  HIPCHECK(hipEventCreate(&start));
+  HIPCHECK(hipEventCreate(&stop));
+  float allEventMs = 0;
+  for(int i = 0; i < nIter; i++) {
+    HIPCHECK(hipEventRecord(start, NULL));
+
+    hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
+
+    HIPCHECK(hipEventRecord(stop, NULL));
+    HIPCHECK(hipEventSynchronize(stop));
+
+    float eventMs = 1.0f;
+    HIPCHECK(hipEventElapsedTime(&eventMs, start, stop));
+
+    allEventMs += eventMs;
+
+  }
+
+  double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs;
+  cout << "info: average read speed of " << perfe << " GB/s " << "achieved for memory size of " <<
+      nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl;
+
+  delete [] hSrc;
+  delete hDst;
+  hipFree(dSrc);
+  hipFree(dDst);
+  HIPCHECK(hipStreamDestroy(stream));
+
+  passed();
+}
diff --git a/projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp b/projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp
new file mode 100644
index 0000000000..9760a161c7
--- /dev/null
+++ b/projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+using namespace std;
+
+#define arraySize 16
+
+typedef struct d_uint16 {
+  uint data[arraySize];
+} d_uint16;
+
+__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) {
+    size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
+    for (size_t i = idx; i < N; i += stride) {
+      dst[i] = pval;
+    }
+};
+
+int main(int argc, char* argv[]) {
+  d_uint16 *dDst;
+  d_uint16 *hDst;
+  hipStream_t stream;
+  ulong N = 4 * 1024 * 1024;
+  uint nBytes = N * sizeof(d_uint16);
+  d_uint16 pval;
+
+  for (int i = 0; i < arraySize; i++) {
+    pval.data[i] = 0xabababab;
+  }
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+      cout << "info: didn't find any GPU! skipping the test!\n";
+      passed();
+      return 0;
+  }
+
+  static int device = 0;
+  HIPCHECK(hipSetDevice(device));
+  hipDeviceProp_t props;
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name <<
+      " with " << props.multiProcessorCount << " CUs" << endl;
+
+  size_t threadsPerBlock = 64;
+  size_t blocks = props.multiProcessorCount * 4;
+
+  uint inputData = 0xabababab;
+  int nIter = 1000;
+
+  hDst =  new d_uint16[nBytes];
+  HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
+  for (size_t i = 0; i < N; i++) {
+    for (size_t j = 0; j < arraySize; j++) {
+      hDst[i].data[j] = 0;
+    }
+  }
+
+  HIPCHECK(hipMalloc(&dDst, nBytes));
+
+  HIPCHECK(hipStreamCreate(&stream));
+
+
+  cout << "info: warm up launch for 'write_kernel' on the stream " << stream << endl;
+
+  hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
+  HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost));
+  hipDeviceSynchronize();
+
+  for (uint i = 0; i < N; i++) {
+    for (uint j = 0; j < arraySize; j++) {
+      if (hDst[i].data[j] != inputData) {
+        cout << "info: Data validation failed for warm up run! " << endl;
+        cout << "at index i: " << i << " element j: " << j << endl;
+        cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl;
+        HIPCHECK(hipErrorUnknown);
+      }
+    }
+  }
+
+  cout << "info: data validated for warm up launch for 'write_kernel" << endl;
+  cout << "info: launching 'write_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl;
+
+  auto all_start = chrono::steady_clock::now();
+  for(int i = 0; i < nIter; i++) {
+    hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
+  }
+  hipDeviceSynchronize();
+  auto all_end = chrono::steady_clock::now();
+  chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count();
+
+  cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " <<
+      nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl;
+
+  // measure performance based on events time
+  hipEvent_t start, stop;
+  HIPCHECK(hipEventCreate(&start));
+  HIPCHECK(hipEventCreate(&stop));
+  float allEventMs = 0;
+  for(int i = 0; i < nIter; i++) {
+    HIPCHECK(hipEventRecord(start, NULL));
+
+    hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
+
+    HIPCHECK(hipEventRecord(stop, NULL));
+    HIPCHECK(hipEventSynchronize(stop));
+
+    float eventMs = 1.0f;
+    HIPCHECK(hipEventElapsedTime(&eventMs, start, stop));
+
+    allEventMs += eventMs;
+
+  }
+
+  double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs;
+  cout << "info: average write speed of " << perfe << " GB/s " << "achieved for memory size of " <<
+      nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl;
+
+  delete [] hDst;
+  hipFree(dDst);
+  HIPCHECK(hipStreamDestroy(stream));
+
+  passed();
+}