2016-05-23 12:11:26 +08:00
|
|
|
/*
|
2017-02-03 10:53:36 -06:00
|
|
|
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
2016-05-23 12:11:26 +08:00
|
|
|
|
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
|
|
|
in the Software without restriction, including without limitation the rights
|
|
|
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
|
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
|
|
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
|
|
|
all copies or substantial portions of the Software.
|
|
|
|
|
|
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
|
THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2016-09-23 16:15:31 +05:30
|
|
|
/* HIT_START
|
2021-01-04 19:58:20 -05:00
|
|
|
* BUILD: %t %s ../test_common.cpp
|
2020-06-09 15:45:22 -04:00
|
|
|
* TEST: %t EXCLUDE_HIP_PLATFORM nvidia
|
2016-09-23 16:15:31 +05:30
|
|
|
* HIT_END
|
|
|
|
|
*/
|
|
|
|
|
|
2016-10-04 22:20:50 +05:30
|
|
|
#include "hip/hip_runtime.h"
|
2016-05-23 12:11:26 +08:00
|
|
|
#include "test_common.h"
|
|
|
|
|
|
2020-10-08 11:56:14 -04:00
|
|
|
template <unsigned batch, typename T>
|
|
|
|
|
__device__ void sum(T* sdata, unsigned groupElements, unsigned tid) {
|
|
|
|
|
T tmp;
|
|
|
|
|
if (groupElements < batch)
|
|
|
|
|
return;
|
|
|
|
|
// sdata[tid] += sdata[tid - batch/2] does not work when block size is
|
|
|
|
|
// greater than wave size because one wave may complete before another
|
|
|
|
|
// wave.
|
|
|
|
|
if (tid >= batch/2 && tid < groupElements)
|
|
|
|
|
tmp = sdata[tid - batch/2];
|
|
|
|
|
__syncthreads();
|
|
|
|
|
if (tid >= batch/2 && tid < groupElements)
|
|
|
|
|
sdata[tid] += tmp;
|
|
|
|
|
__syncthreads();
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-12 11:29:03 +05:30
|
|
|
template <typename T>
|
2018-10-17 12:01:44 +05:30
|
|
|
__global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d,
|
2018-03-12 11:29:03 +05:30
|
|
|
size_t numElements, size_t groupElements) {
|
2016-05-23 12:11:26 +08:00
|
|
|
// declare dynamic shared memory
|
2020-12-15 17:38:08 -05:00
|
|
|
#if defined(__HIP_PLATFORM_AMD__)
|
2016-05-23 12:11:26 +08:00
|
|
|
HIP_DYNAMIC_SHARED(T, sdata)
|
2016-09-09 12:01:41 +05:30
|
|
|
#else
|
|
|
|
|
HIP_DYNAMIC_SHARED(__align__(sizeof(T)) unsigned char, my_sdata)
|
2018-03-12 11:29:03 +05:30
|
|
|
T* sdata = reinterpret_cast<T*>(my_sdata);
|
2016-09-09 12:01:41 +05:30
|
|
|
#endif
|
2016-05-23 12:11:26 +08:00
|
|
|
|
2017-11-19 01:54:12 +00:00
|
|
|
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
|
|
|
|
size_t tid = threadIdx.x;
|
2016-05-23 12:11:26 +08:00
|
|
|
|
|
|
|
|
// initialize dynamic shared memory
|
|
|
|
|
if (tid < groupElements) {
|
2017-02-03 10:53:36 -06:00
|
|
|
sdata[tid] = static_cast<T>(tid);
|
2016-05-23 12:11:26 +08:00
|
|
|
}
|
2018-03-12 11:29:03 +05:30
|
|
|
__syncthreads();
|
2016-05-23 12:11:26 +08:00
|
|
|
|
2020-10-08 11:56:14 -04:00
|
|
|
// prefix sum inside dynamic shared memory
|
|
|
|
|
sum<512>(sdata, groupElements, tid);
|
|
|
|
|
sum<256>(sdata, groupElements, tid);
|
|
|
|
|
sum<128>(sdata, groupElements, tid);
|
|
|
|
|
sum<64>(sdata, groupElements, tid);
|
|
|
|
|
sum<32>(sdata, groupElements, tid);
|
|
|
|
|
sum<16>(sdata, groupElements, tid);
|
|
|
|
|
sum<8>(sdata, groupElements, tid);
|
|
|
|
|
sum<4>(sdata, groupElements, tid);
|
|
|
|
|
sum<2>(sdata, groupElements, tid);
|
2016-05-23 12:11:26 +08:00
|
|
|
C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements];
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-12 11:29:03 +05:30
|
|
|
template <typename T>
|
2016-05-23 12:11:26 +08:00
|
|
|
void testExternShared(size_t N, size_t groupElements) {
|
|
|
|
|
size_t Nbytes = N * sizeof(T);
|
|
|
|
|
|
|
|
|
|
T *A_d, *B_d, *C_d;
|
|
|
|
|
T *A_h, *B_h, *C_h;
|
|
|
|
|
|
|
|
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
2020-10-08 11:56:14 -04:00
|
|
|
unsigned blocks = N/threadsPerBlock;
|
|
|
|
|
assert(N == blocks * threadsPerBlock);
|
2016-05-23 12:11:26 +08:00
|
|
|
|
2020-10-08 11:56:14 -04:00
|
|
|
// printf("blocks: %d\nthreadsPerBlock: %d\nN: %zu\n", blocks, threadsPerBlock, N);
|
2016-05-23 12:11:26 +08:00
|
|
|
|
|
|
|
|
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
|
|
|
|
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
|
|
|
|
|
|
|
|
|
// calculate the amount of dynamic shared memory required
|
|
|
|
|
size_t groupMemBytes = groupElements * sizeof(T);
|
|
|
|
|
|
|
|
|
|
// launch kernel with dynamic shared memory
|
2018-10-17 12:01:44 +05:30
|
|
|
hipLaunchKernelGGL(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
|
2018-03-12 11:29:03 +05:30
|
|
|
groupMemBytes, 0, A_d, B_d, C_d, N, groupElements);
|
2016-05-23 12:11:26 +08:00
|
|
|
|
|
|
|
|
HIPCHECK(hipDeviceSynchronize());
|
|
|
|
|
|
|
|
|
|
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
|
|
|
|
|
|
|
|
|
// verify
|
|
|
|
|
for (size_t i = 0; i < N; ++i) {
|
2020-10-08 11:56:14 -04:00
|
|
|
size_t tid = (i % min(threadsPerBlock, groupElements));
|
2016-05-23 12:11:26 +08:00
|
|
|
T sumFromSharedMemory = static_cast<T>(tid * (tid + 1) / 2);
|
|
|
|
|
T expected = A_h[i] + B_h[i] + sumFromSharedMemory;
|
|
|
|
|
if (C_h[i] != expected) {
|
2018-03-12 11:29:03 +05:30
|
|
|
std::cout << std::fixed << std::setprecision(32);
|
|
|
|
|
std::cout << "At " << i << std::endl;
|
|
|
|
|
std::cout << " Computed:" << C_h[i] << std::endl;
|
|
|
|
|
std::cout << " Expected:" << expected << std::endl;
|
|
|
|
|
std::cout << sumFromSharedMemory << std::endl;
|
|
|
|
|
std::cout << A_h[i] << std::endl;
|
|
|
|
|
std::cout << B_h[i] << std::endl;
|
|
|
|
|
|
|
|
|
|
failed("Failed at index:%zu\n", i);
|
2016-05-23 12:11:26 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-12 11:29:03 +05:30
|
|
|
int main(int argc, char* argv[]) {
|
2016-05-23 12:11:26 +08:00
|
|
|
HipTest::parseStandardArguments(argc, argv, true);
|
|
|
|
|
|
2018-03-12 11:29:03 +05:30
|
|
|
// printf("info: set device to %d\n", p_gpuDevice);
|
2016-05-23 12:11:26 +08:00
|
|
|
HIPCHECK(hipSetDevice(p_gpuDevice));
|
|
|
|
|
|
|
|
|
|
testExternShared<float>(1024, 4);
|
|
|
|
|
testExternShared<float>(1024, 8);
|
|
|
|
|
testExternShared<float>(1024, 16);
|
|
|
|
|
testExternShared<float>(1024, 32);
|
|
|
|
|
testExternShared<float>(1024, 64);
|
|
|
|
|
|
|
|
|
|
testExternShared<float>(65536, 4);
|
|
|
|
|
testExternShared<float>(65536, 8);
|
|
|
|
|
testExternShared<float>(65536, 16);
|
|
|
|
|
testExternShared<float>(65536, 32);
|
|
|
|
|
testExternShared<float>(65536, 64);
|
|
|
|
|
|
|
|
|
|
testExternShared<double>(1024, 4);
|
|
|
|
|
testExternShared<double>(1024, 8);
|
|
|
|
|
testExternShared<double>(1024, 16);
|
|
|
|
|
testExternShared<double>(1024, 32);
|
|
|
|
|
testExternShared<double>(1024, 64);
|
|
|
|
|
|
|
|
|
|
testExternShared<double>(65536, 4);
|
|
|
|
|
testExternShared<double>(65536, 8);
|
|
|
|
|
testExternShared<double>(65536, 16);
|
|
|
|
|
testExternShared<double>(65536, 32);
|
|
|
|
|
testExternShared<double>(65536, 64);
|
|
|
|
|
|
|
|
|
|
passed();
|
|
|
|
|
}
|