cf174d5a47
Change-Id: I13d2513f31dffe0b280039c888a97cc0d7bba31f
263 wiersze
8.3 KiB
C++
263 wiersze
8.3 KiB
C++
/*
|
||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
of this software and associated documentation files (the "Software"), to deal
|
||
in the Software without restriction, including without limitation the rights
|
||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
copies of the Software, and to permit persons to whom the Software is
|
||
furnished to do so, subject to the following conditions:
|
||
The above copyright notice and this permission notice shall be included in
|
||
all copies or substantial portions of the Software.
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||
THE SOFTWARE.
|
||
*/
|
||
|
||
/**
|
||
* @addtogroup hipMemcpyKernel hipMemcpyKernel
|
||
* @{
|
||
* @ingroup perfMemoryTest
|
||
* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
|
||
* Copies data between host and device.
|
||
*/
|
||
|
||
#include <hip_test_common.hh>
|
||
|
||
#define sharedMemSize1 2048
|
||
#define sharedMemSize2 256
|
||
|
||
__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) {
|
||
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||
size_t lid = threadIdx.x;
|
||
__shared__ float local[sharedMemSize1];
|
||
|
||
float val1 = 0;
|
||
float val2 = 0;
|
||
float val3 = 0;
|
||
float val4 = 0;
|
||
|
||
for (int i = 0; i < (sharedMemSize1 / 64); i++) {
|
||
local[lid + i * 64] = lid;
|
||
}
|
||
|
||
__syncthreads();
|
||
|
||
val1 += local[lid];
|
||
val2 += local[lid + 64];
|
||
val3 += local[lid + 128];
|
||
val4 += local[lid + 192];
|
||
val1 += local[lid + 256];
|
||
val2 += local[lid + 320];
|
||
val3 += local[lid + 384];
|
||
val4 += local[lid + 448];
|
||
val1 += local[lid + 512];
|
||
val2 += local[lid + 576];
|
||
val3 += local[lid + 640];
|
||
val4 += local[lid + 704];
|
||
val1 += local[lid + 768];
|
||
val2 += local[lid + 832];
|
||
val3 += local[lid + 896];
|
||
val4 += local[lid + 960];
|
||
val1 += local[lid + 1024];
|
||
val2 += local[lid + 1088];
|
||
val3 += local[lid + 1152];
|
||
val4 += local[lid + 1216];
|
||
val1 += local[lid + 1280];
|
||
val2 += local[lid + 1344];
|
||
val3 += local[lid + 1408];
|
||
val4 += local[lid + 1472];
|
||
val1 += local[lid + 1536];
|
||
val2 += local[lid + 1600];
|
||
val3 += local[lid + 1664];
|
||
val4 += local[lid + 1728];
|
||
val1 += local[lid + 1792];
|
||
val2 += local[lid + 1856];
|
||
val3 += local[lid + 1920];
|
||
val4 += local[lid + 1984];
|
||
|
||
if (gid < N) {
|
||
outBuf[gid] = val1 + val2 + val3 + val4;
|
||
}
|
||
}
|
||
|
||
__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) {
|
||
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||
size_t lid = threadIdx.x;
|
||
__shared__ float local[sharedMemSize2];
|
||
|
||
float val0 = 0.0f;
|
||
float val1 = 0.0f;
|
||
|
||
for (int i = 0; i < (sharedMemSize2 / 64); i++) {
|
||
local[lid + i * 64] = lid;
|
||
}
|
||
|
||
__syncthreads();
|
||
|
||
#pragma nounroll
|
||
for (uint i = 0; i < 32; i++) {
|
||
val0 += local[8 * i + 0];
|
||
val1 += local[8 * i + 1];
|
||
val0 += local[8 * i + 2];
|
||
val1 += local[8 * i + 3];
|
||
val0 += local[8 * i + 4];
|
||
val1 += local[8 * i + 5];
|
||
val0 += local[8 * i + 6];
|
||
val1 += local[8 * i + 7];
|
||
}
|
||
|
||
if (gid < N) {
|
||
outBuf[gid] = val0 + val1;
|
||
}
|
||
}
|
||
|
||
static bool hipPerfSharedMemReadSpeed_test() {
|
||
float *dDst;
|
||
float *hDst;
|
||
hipStream_t stream;
|
||
constexpr uint numSizes = 4;
|
||
constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216};
|
||
uint numReads1 = 32;
|
||
uint numReads2 = 256;
|
||
uint sharedMemSizeBytes1 = sharedMemSize1 * sizeof(float);
|
||
uint sharedMemSizeBytes2 = sharedMemSize2 * sizeof(float);
|
||
int nIter = 1000;
|
||
const unsigned threadsPerBlock = 64;
|
||
|
||
static int device = 0;
|
||
HIP_CHECK(hipSetDevice(device));
|
||
hipDeviceProp_t props;
|
||
HIP_CHECK(hipGetDeviceProperties(&props, device));
|
||
INFO("info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||
<< " with " << props.multiProcessorCount << " CUs \n");
|
||
|
||
HIP_CHECK(hipStreamCreate(&stream));
|
||
|
||
for (int nTest = 0; nTest < numSizes; nTest++) {
|
||
uint nBytes = Sizes[nTest % numSizes];
|
||
ulong N = nBytes / sizeof(float);
|
||
const unsigned blocks = N / threadsPerBlock;
|
||
|
||
hDst = new float[nBytes];
|
||
HIP_CHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||
memset(hDst, 0, nBytes);
|
||
|
||
HIP_CHECK(hipMalloc(&dDst, nBytes));
|
||
HIP_CHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
|
||
|
||
hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks),
|
||
dim3(threadsPerBlock), 0, stream, dDst, N);
|
||
HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
|
||
HIP_CHECK(hipDeviceSynchronize());
|
||
|
||
int tmp = 0;
|
||
for (int i = 0; i < N; i++) {
|
||
if (i % threadsPerBlock == 0) {
|
||
tmp = 0;
|
||
}
|
||
if (hDst[i] != tmp) {
|
||
INFO("info: Data validation failed for warm up run! \n");
|
||
INFO("info: expected " << tmp << " got " << hDst[i] << " \n");
|
||
return false;
|
||
}
|
||
tmp += threadsPerBlock / 2;
|
||
}
|
||
|
||
auto all_start = std::chrono::steady_clock::now();
|
||
for (int i = 0; i < nIter; i++) {
|
||
hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks),
|
||
dim3(threadsPerBlock), 0, stream, dDst, N);
|
||
}
|
||
HIP_CHECK(hipDeviceSynchronize());
|
||
|
||
auto all_end = std::chrono::steady_clock::now();
|
||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||
|
||
// read speed in GB/s
|
||
double perf = (static_cast<double>(blocks * threadsPerBlock)
|
||
* (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64)
|
||
* nIter * (1e-09)) / all_kernel_time.count();
|
||
|
||
INFO("info: read speed = " << std::setw(8) << perf << " GB/s for " <<
|
||
sharedMemSizeBytes1 / 1024 << " KB shared memory with " <<
|
||
std::setw(8) << blocks * threadsPerBlock << " threads, "
|
||
<< std::setw(4) << numReads1 <<
|
||
" reads in sharedMemReadSpeed1 kernel \n");
|
||
|
||
delete[] hDst;
|
||
HIP_CHECK(hipFree(dDst));
|
||
}
|
||
|
||
for (int nTest = 0; nTest < numSizes; nTest++) {
|
||
uint nBytes = Sizes[nTest % numSizes];
|
||
ulong N = nBytes / sizeof(float);
|
||
const unsigned blocks = N / threadsPerBlock;
|
||
|
||
hDst = new float[nBytes];
|
||
HIP_CHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||
memset(hDst, 0, nBytes);
|
||
|
||
HIP_CHECK(hipMalloc(&dDst, nBytes));
|
||
HIP_CHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
|
||
|
||
hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks),
|
||
dim3(threadsPerBlock), 0, stream, dDst, N);
|
||
HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
|
||
HIP_CHECK(hipDeviceSynchronize());
|
||
|
||
auto all_start = std::chrono::steady_clock::now();
|
||
for (int i = 0; i < nIter; i++) {
|
||
hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks),
|
||
dim3(threadsPerBlock), 0, stream, dDst, N);
|
||
}
|
||
HIP_CHECK(hipDeviceSynchronize());
|
||
|
||
auto all_end = std::chrono::steady_clock::now();
|
||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||
|
||
// read speed in GB/s
|
||
double perf = (static_cast<double>(blocks * threadsPerBlock)
|
||
* (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64)
|
||
* nIter * (1e-09)) / all_kernel_time.count();
|
||
|
||
INFO("info: read speed = " << std::setw(8) << perf << " GB/s for "
|
||
<< sharedMemSizeBytes2 / 1024 << " KB shared memory with "
|
||
<< std::setw(8) << blocks * threadsPerBlock << " threads, "
|
||
<< std::setw(4) << numReads2 <<
|
||
" reads in sharedMemReadSpeed2 kernel \n");
|
||
|
||
delete[] hDst;
|
||
HIP_CHECK(hipFree(dDst));
|
||
}
|
||
HIP_CHECK(hipStreamDestroy(stream));
|
||
return true;
|
||
}
|
||
|
||
/**
|
||
* Test Description
|
||
* ------------------------
|
||
* - Verify hipPerfSharedMemReadSpeed status.
|
||
* Test source
|
||
* ------------------------
|
||
* - perftests/memory/hipPerfSharedMemReadSpeed.cc
|
||
* Test requirements
|
||
* ------------------------
|
||
* - HIP_VERSION >= 5.6
|
||
*/
|
||
|
||
TEST_CASE("Perf_hipPerfSharedMemReadSpeed_test") {
|
||
int numDevices = 0;
|
||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||
|
||
if (numDevices <= 0) {
|
||
SUCCEED("Skipped testcase hipPerfSharedMemReadSpeed as"
|
||
"there is no device to test.\n");
|
||
} else {
|
||
REQUIRE(true == hipPerfSharedMemReadSpeed_test());
|
||
}
|
||
}
|