From 69ae390cc3ff8a3c01dfbeddb88980567bfdd525 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Wed, 29 Jul 2020 21:26:10 -0400 Subject: [PATCH] [HipPerf] add two subtests for measuring maximum device memory read/write speed SWDEV-245290 / SWDEV-246220 [HIPPerf] Port OCLPerfDevMemWriteSpeed/OCLPerfDevMemReadSpeed into hip performance subtests Change-Id: I5dc323c75cebbc17596dcb4ed9492e18c5246868 [ROCm/hip-tests commit: 87558f64482bb685437630119146124b5cea1080] --- .../memory/hipPerfDevMemReadSpeed.cpp | 165 ++++++++++++++++++ .../memory/hipPerfDevMemWriteSpeed.cpp | 155 ++++++++++++++++ 2 files changed, 320 insertions(+) create mode 100644 projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp create mode 100644 projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp diff --git a/projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp b/projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp new file mode 100644 index 0000000000..f740d50ace --- /dev/null +++ b/projects/hip-tests/perftests/memory/hipPerfDevMemReadSpeed.cpp @@ -0,0 +1,165 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define arraySize 16 + +typedef struct d_uint16 { + uint data[arraySize]; +} d_uint16; + +__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) { + + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; + + uint tmp = 0; + for (size_t i = idx; i < N; i += stride) { + for (size_t j = 0; j < arraySize; j++) { + tmp += src[i].data[j]; + } + } + + atomicAdd(dst, tmp); +} + +int main(int argc, char* argv[]) { + d_uint16 *dSrc; + d_uint16 *hSrc; + uint *dDst; + uint *hDst; + hipStream_t stream; + ulong N = 4 * 1024 * 1024; + uint nBytes = N * sizeof(d_uint16); + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << + " with " << props.multiProcessorCount << " CUs" << endl; + + const unsigned threadsPerBlock = 64; + const unsigned blocks = props.multiProcessorCount * 4; + + uint inputData = 0x1; + int nIter = 1000; + + hSrc = new d_uint16[nBytes]; + HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess); + hDst = new uint; + hDst[0] = 0; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + for (size_t i = 0; i < N; i++) { + for (int j = 0; j < arraySize; j++) { + hSrc[i].data[j] = inputData; + } + } + + HIPCHECK(hipMalloc(&dSrc, nBytes)); + HIPCHECK(hipMalloc(&dDst, sizeof(uint))); + + HIPCHECK(hipStreamCreate(&stream)); + + HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice)); + + cout << "info: warm up launch for 'read_kernel' on the stream " << stream << endl; + + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + if (hDst[0] != (nBytes / sizeof(uint))) { + cout << "info: Data validation failed for warm up run!" << endl; + cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl; + HIPCHECK(hipErrorUnknown); + } + + cout << "info: data validated for warm up launch for 'read_kernel'" << endl; + cout << "info: launching 'read_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl; + + // measure performance based on host time + auto all_start = chrono::steady_clock::now(); + + for(int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); + + cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl; + + // measure performance based on events time + hipEvent_t start, stop; + HIPCHECK(hipEventCreate(&start)); + HIPCHECK(hipEventCreate(&stop)); + float allEventMs = 0; + for(int i = 0; i < nIter; i++) { + HIPCHECK(hipEventRecord(start, NULL)); + + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + + HIPCHECK(hipEventRecord(stop, NULL)); + HIPCHECK(hipEventSynchronize(stop)); + + float eventMs = 1.0f; + HIPCHECK(hipEventElapsedTime(&eventMs, start, stop)); + + allEventMs += eventMs; + + } + + double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs; + cout << "info: average read speed of " << perfe << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl; + + delete [] hSrc; + delete hDst; + hipFree(dSrc); + hipFree(dDst); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} diff --git a/projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp b/projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp new file mode 100644 index 0000000000..9760a161c7 --- /dev/null +++ b/projects/hip-tests/perftests/memory/hipPerfDevMemWriteSpeed.cpp @@ -0,0 +1,155 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define arraySize 16 + +typedef struct d_uint16 { + uint data[arraySize]; +} d_uint16; + +__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) { + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < N; i += stride) { + dst[i] = pval; + } +}; + +int main(int argc, char* argv[]) { + d_uint16 *dDst; + d_uint16 *hDst; + hipStream_t stream; + ulong N = 4 * 1024 * 1024; + uint nBytes = N * sizeof(d_uint16); + d_uint16 pval; + + for (int i = 0; i < arraySize; i++) { + pval.data[i] = 0xabababab; + } + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << + " with " << props.multiProcessorCount << " CUs" << endl; + + size_t threadsPerBlock = 64; + size_t blocks = props.multiProcessorCount * 4; + + uint inputData = 0xabababab; + int nIter = 1000; + + hDst = new d_uint16[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < arraySize; j++) { + hDst[i].data[j] = 0; + } + } + + HIPCHECK(hipMalloc(&dDst, nBytes)); + + HIPCHECK(hipStreamCreate(&stream)); + + + cout << "info: warm up launch for 'write_kernel' on the stream " << stream << endl; + + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + for (uint i = 0; i < N; i++) { + for (uint j = 0; j < arraySize; j++) { + if (hDst[i].data[j] != inputData) { + cout << "info: Data validation failed for warm up run! " << endl; + cout << "at index i: " << i << " element j: " << j << endl; + cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl; + HIPCHECK(hipErrorUnknown); + } + } + } + + cout << "info: data validated for warm up launch for 'write_kernel" << endl; + cout << "info: launching 'write_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl; + + auto all_start = chrono::steady_clock::now(); + for(int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + } + hipDeviceSynchronize(); + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); + + cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl; + + // measure performance based on events time + hipEvent_t start, stop; + HIPCHECK(hipEventCreate(&start)); + HIPCHECK(hipEventCreate(&stop)); + float allEventMs = 0; + for(int i = 0; i < nIter; i++) { + HIPCHECK(hipEventRecord(start, NULL)); + + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + + HIPCHECK(hipEventRecord(stop, NULL)); + HIPCHECK(hipEventSynchronize(stop)); + + float eventMs = 1.0f; + HIPCHECK(hipEventElapsedTime(&eventMs, start, stop)); + + allEventMs += eventMs; + + } + + double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs; + cout << "info: average write speed of " << perfe << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl; + + delete [] hDst; + hipFree(dDst); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +}