From bf4ba9e09e7dba68fa1b2d4b7c1b5afe57ade87b Mon Sep 17 00:00:00 2001 From: Jack Chung Date: Mon, 23 May 2016 12:11:26 +0800 Subject: [PATCH] Squashed commit of the following: commit 9548493fa754b3bf5c31cbdc2211db1e73e8c07c Author: Jack Chung Date: Mon May 23 11:57:23 2016 +0800 Rename hipExternShared test to hipDynamicShared Change-Id: I180d9d539420fb69cfc121eceaa7db9da03483b2 commit 827081f8244a38f010789d556db0c4ff7b6422d8 Author: Jack Chung Date: Mon May 23 11:56:27 2016 +0800 Rename HIP_DECLARE_EXTERN_SHARED to HIP_DYNAMIC_SHARED Change-Id: I22362d179812ac547e0f11ba4e2bb999050e08ae commit 4c277228ed41af187739610fa17eab1fb144c947 Author: Jack Chung Date: Thu May 19 17:49:52 2016 +0800 Adopt new interface to get dynamic LDS in hc.hpp Change-Id: I47b433b714633a4c97df87c40a0b1d3386429a00 commit 5a36117d777064113a528dc47b42e8c8413baa97 Author: Jack Chung Date: Thu May 19 11:29:24 2016 +0800 Add test patterns for regular expression to match "extern __shared__" These test patterns should better be saved as an individual test case, but I'm not familiar with HIP test structures so I leave them as comments in hipify as of now. Change-Id: I7fee89c89b9e73de2133357a226ec0c769733531 commit 1b26284168c7f5339f63338fd0149bed5d994656 Author: Jack Chung Date: Thu May 19 11:25:23 2016 +0800 Add one HIP unit test to use HIP_DECLARE_EXTERN_SHARED Change-Id: I4d9907815920693a74ea9d575fe26e7c67636109 commit 77b816ee5972b13d829d5bbcf06fbfd07acea2af Author: Jack Chung Date: Wed May 18 19:18:59 2016 +0800 Adopt HIP_ prefix for DECLARE_EXTERN_SHARED macro Change-Id: I555ded16b449b67d2e20904013d86fe1ded6a2be commit ef0997939c3578a9ae11621bf21c0416f04d2622 Author: Jack Chung Date: Wed May 18 17:42:04 2016 +0800 Modify hipify to support converting extern __shared__ to DECLARE_EXTERN_SHARED macro Added regular expression to search & replace extern __shared__ declarations to DECLARE_EXTERN_SHARED macro. Limitation: - Won't work if "extern __shared__" is declared at global scope Sample Usages: extern __shared__ double foo[]; extern __shared__ unsigned int foo[]; extern volatile __shared__ double foo[]; extern volatile __shared__ unsigned int sdata[]; extern __shared__ volatile unsigned int sdata[]; extern __shared__ T s[]; extern __shared__ T::type s[]; extern __shared__ blah::type s[]; extern __shared__ typename mapper::type s_data[]; extern __attribute__((used)) __shared__ typename mapper::type s_data[]; Change-Id: I2be0b7039adeddb789f5a2b067d403a43fdc3e26 commit 93ff268724493aedfacdcd5a5aa9a100f4ebaed0 Author: Jack Chung Date: Wed May 18 15:13:09 2016 +0800 Introduce DECLARE_EXTERN_SHARED macro to encapsulate "extern __shared__" decls Change-Id: I93b2d37c763195b0ca9fd0afee78605a1e3272db commit cff9c95412de343cc6405158b5acc4f1029267ff Author: Jack Chung Date: Wed May 18 12:53:54 2016 +0800 Add __get_dynamic_groupbaseptr() to point to dynamic LDS Change-Id: I97b548d8a691488057617c551a8f331cad7afc77 Change-Id: I84e7875b76fa1f59e860e19c93bd4209cdd1fd2c --- bin/hipify | 30 ++++++- include/hcc_detail/hip_runtime.h | 12 +++ include/nvcc_detail/hip_runtime.h | 7 ++ src/device_util.cpp | 5 ++ tests/src/CMakeLists.txt | 3 + tests/src/hipDynamicShared.cpp | 138 ++++++++++++++++++++++++++++++ 6 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 tests/src/hipDynamicShared.cpp diff --git a/bin/hipify b/bin/hipify index af7c7edce7..ce934dff15 100755 --- a/bin/hipify +++ b/bin/hipify @@ -84,7 +84,7 @@ push (@warn_whitelist, split(',',$warn_whitelist)); #--- #Stats tracking code: -@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "other"); +@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "extern_shared", "other"); #--- @@ -428,6 +428,34 @@ while (@ARGV) { $countKeywords += m/__global__/; $countKeywords += m/__shared__/; + #-------- + # CUDA extern __shared__ syntax + # Note these only work if declaration is on a single line. + { + # match uses ? for <.*> which will be unitialized if this is not present in launch syntax. + no warnings qw/uninitialized/; + + my $k = 0; + + # Match extern __shared__ type foo[]; syntax + # Replace as HIP_DYNAMIC_SHARED() macro + $k += s/extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;/HIP_DYNAMIC_SHARED($1 $2, $3)/g; + + # test patterns for the regular expression above: + #'extern __shared__ double foo[];' + #'extern __shared__ unsigned int foo[];' + #'extern volatile __shared__ double foo[];' + #'extern volatile __shared__ unsigned int sdata[];' + #'extern __shared__ volatile unsigned int sdata[];' + #'extern __shared__ T s[];' + #'extern __shared__ T::type s[];' + #'extern __shared__ blah::type s[];' + #'extern __shared__ typename mapper::type s_data[];' + #'extern __attribute__((used)) __shared__ typename mapper::type s_data[];' + + $ft{'extern_shared'} += $k; + } + #-------- # CUDA Launch Syntax # Note these only work if launch is on a single line. diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h index 0d70eaa2a4..eb518cc88d 100644 --- a/include/hcc_detail/hip_runtime.h +++ b/include/hcc_detail/hip_runtime.h @@ -430,6 +430,8 @@ __device__ float __shfl_xor(float input, int lane_mask, int width); __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); +__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr(); + //TODO - add a couple fast math operations here, the set here will grow : __device__ float __cosf(float x); __device__ float __expf(float x); @@ -556,6 +558,16 @@ do {\ #endif +/** + * extern __shared__ + */ + +// Macro to replace extern __shared__ declarations +// to local variable definitions +#define HIP_DYNAMIC_SHARED(type, var) \ + __attribute__((address_space(3))) type* var = \ + (__attribute__((address_space(3))) type*)__get_dynamicgroupbaseptr(); \ + #endif // __HCC__ diff --git a/include/nvcc_detail/hip_runtime.h b/include/nvcc_detail/hip_runtime.h index cb1253fdf1..06c6ffb9b3 100644 --- a/include/nvcc_detail/hip_runtime.h +++ b/include/nvcc_detail/hip_runtime.h @@ -95,6 +95,13 @@ kernelName<<>>(0, __VA_ARGS__);\ #define hipGridDim_y gridDim.y #define hipGridDim_z gridDim.z +/** + * extern __shared__ + */ + +#define HIP_DYNAMIC_SHARED(type, var) \ + extern __shared__ type var[]; \ + #endif diff --git a/src/device_util.cpp b/src/device_util.cpp index 3234408e50..11686c030c 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -808,6 +808,11 @@ __host__ __device__ int max(int arg1, int arg2) return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } +__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr() +{ + return hc::get_dynamic_group_segment_base_pointer(); +} + //TODO - add a couple fast math operations here, the set here will grow : diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index 111d945464..e0894df5fa 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -191,6 +191,7 @@ make_hip_executable (hipPeerToPeer_simple hipPeerToPeer_simple.cpp) make_hip_executable (hipMemcpyAll hipMemcpyAll.cpp) make_hip_executable (hipMultiThreadDevice hipMultiThreadDevice.cpp) make_hip_executable (hipTestMemcpyPin hipTestMemcpyPin.cpp) +make_hip_executable (hipDynamicShared hipDynamicShared.cpp) make_test(hip_ballot " " ) make_test(hip_anyall " " ) @@ -251,3 +252,5 @@ if (${HIP_PLATFORM} STREQUAL "hcc") endif() make_hipify_test(specialFunc.cu ) + +make_test(hipDynamicShared " ") diff --git a/tests/src/hipDynamicShared.cpp b/tests/src/hipDynamicShared.cpp new file mode 100644 index 0000000000..329529281c --- /dev/null +++ b/tests/src/hipDynamicShared.cpp @@ -0,0 +1,138 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include "test_common.h" + +template +__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t numElements, size_t groupElements) { + + // declare dynamic shared memory + HIP_DYNAMIC_SHARED(T, sdata) + + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; + + // initialize dynamic shared memory + if (tid < groupElements) { + sdata[tid] = static_cast(tid); + } + + // prefix sum inside dynamic shared memory + if (groupElements >= 512) { + if (tid >= 256) { sdata[tid] += sdata[tid - 256]; } __syncthreads(); + } + if (groupElements >= 256) { + if (tid >= 128) { sdata[tid] += sdata[tid - 128]; } __syncthreads(); + } + if (groupElements >= 128) { + if (tid >= 64) { sdata[tid] += sdata[tid - 64]; } __syncthreads(); + } + if (groupElements >= 64) { sdata[tid] += sdata[tid - 32]; } __syncthreads(); + if (groupElements >= 32) { sdata[tid] += sdata[tid - 16]; } __syncthreads(); + if (groupElements >= 16) { sdata[tid] += sdata[tid - 8]; } __syncthreads(); + if (groupElements >= 8) { sdata[tid] += sdata[tid - 4]; } __syncthreads(); + if (groupElements >= 4) { sdata[tid] += sdata[tid - 2]; } __syncthreads(); + if (groupElements >= 2) { sdata[tid] += sdata[tid - 1]; } __syncthreads(); + + C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements]; +} + +template +void testExternShared(size_t N, size_t groupElements) { + size_t Nbytes = N * sizeof(T); + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + //printf("blocksPerCU: %d\nthreadsPerBlock: %d\nN: %zu\n", blocksPerCU, threadsPerBlock, N); + + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + // calculate the amount of dynamic shared memory required + size_t groupMemBytes = groupElements * sizeof(T); + + // launch kernel with dynamic shared memory + hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel), dim3(blocks), dim3(threadsPerBlock), groupMemBytes, 0, A_d, B_d, C_d, N, groupElements); + + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + // verify + for (size_t i = 0; i < N; ++i) { + size_t tid = (i % groupElements); + T sumFromSharedMemory = static_cast(tid * (tid + 1) / 2); + T expected = A_h[i] + B_h[i] + sumFromSharedMemory; + if (C_h[i] != expected) { + std::cout << std::fixed << std::setprecision(32); + std::cout << "At " << i << std::endl; + std::cout << " Computed:" << C_h[i] << std::endl; + std::cout << " Expected:" << expected << std::endl; + std::cout << sumFromSharedMemory << std::endl; + std::cout << A_h[i] << std::endl; + std::cout << B_h[i] << std::endl; + + failed("Failed at index:%zu\n", i); + } + } + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); +} + +int main(int argc, char *argv[]) { + HipTest::parseStandardArguments(argc, argv, true); + + //printf("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + testExternShared(1024, 4); + testExternShared(1024, 8); + testExternShared(1024, 16); + testExternShared(1024, 32); + testExternShared(1024, 64); + + testExternShared(65536, 4); + testExternShared(65536, 8); + testExternShared(65536, 16); + testExternShared(65536, 32); + testExternShared(65536, 64); + + testExternShared(1024, 4); + testExternShared(1024, 8); + testExternShared(1024, 16); + testExternShared(1024, 32); + testExternShared(1024, 64); + + testExternShared(65536, 4); + testExternShared(65536, 8); + testExternShared(65536, 16); + testExternShared(65536, 32); + testExternShared(65536, 64); + + passed(); +} +