diff --git a/bin/hipify b/bin/hipify index af7c7edce7..ce934dff15 100755 --- a/bin/hipify +++ b/bin/hipify @@ -84,7 +84,7 @@ push (@warn_whitelist, split(',',$warn_whitelist)); #--- #Stats tracking code: -@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "other"); +@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "extern_shared", "other"); #--- @@ -428,6 +428,34 @@ while (@ARGV) { $countKeywords += m/__global__/; $countKeywords += m/__shared__/; + #-------- + # CUDA extern __shared__ syntax + # Note these only work if declaration is on a single line. + { + # match uses ? for <.*> which will be unitialized if this is not present in launch syntax. + no warnings qw/uninitialized/; + + my $k = 0; + + # Match extern __shared__ type foo[]; syntax + # Replace as HIP_DYNAMIC_SHARED() macro + $k += s/extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;/HIP_DYNAMIC_SHARED($1 $2, $3)/g; + + # test patterns for the regular expression above: + #'extern __shared__ double foo[];' + #'extern __shared__ unsigned int foo[];' + #'extern volatile __shared__ double foo[];' + #'extern volatile __shared__ unsigned int sdata[];' + #'extern __shared__ volatile unsigned int sdata[];' + #'extern __shared__ T s[];' + #'extern __shared__ T::type s[];' + #'extern __shared__ blah::type s[];' + #'extern __shared__ typename mapper::type s_data[];' + #'extern __attribute__((used)) __shared__ typename mapper::type s_data[];' + + $ft{'extern_shared'} += $k; + } + #-------- # CUDA Launch Syntax # Note these only work if launch is on a single line. diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h index 0d70eaa2a4..eb518cc88d 100644 --- a/include/hcc_detail/hip_runtime.h +++ b/include/hcc_detail/hip_runtime.h @@ -430,6 +430,8 @@ __device__ float __shfl_xor(float input, int lane_mask, int width); __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); +__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr(); + //TODO - add a couple fast math operations here, the set here will grow : __device__ float __cosf(float x); __device__ float __expf(float x); @@ -556,6 +558,16 @@ do {\ #endif +/** + * extern __shared__ + */ + +// Macro to replace extern __shared__ declarations +// to local variable definitions +#define HIP_DYNAMIC_SHARED(type, var) \ + __attribute__((address_space(3))) type* var = \ + (__attribute__((address_space(3))) type*)__get_dynamicgroupbaseptr(); \ + #endif // __HCC__ diff --git a/include/nvcc_detail/hip_runtime.h b/include/nvcc_detail/hip_runtime.h index cb1253fdf1..06c6ffb9b3 100644 --- a/include/nvcc_detail/hip_runtime.h +++ b/include/nvcc_detail/hip_runtime.h @@ -95,6 +95,13 @@ kernelName<<>>(0, __VA_ARGS__);\ #define hipGridDim_y gridDim.y #define hipGridDim_z gridDim.z +/** + * extern __shared__ + */ + +#define HIP_DYNAMIC_SHARED(type, var) \ + extern __shared__ type var[]; \ + #endif diff --git a/src/device_util.cpp b/src/device_util.cpp index 3234408e50..11686c030c 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -808,6 +808,11 @@ __host__ __device__ int max(int arg1, int arg2) return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } +__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr() +{ + return hc::get_dynamic_group_segment_base_pointer(); +} + //TODO - add a couple fast math operations here, the set here will grow : diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index 111d945464..e0894df5fa 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -191,6 +191,7 @@ make_hip_executable (hipPeerToPeer_simple hipPeerToPeer_simple.cpp) make_hip_executable (hipMemcpyAll hipMemcpyAll.cpp) make_hip_executable (hipMultiThreadDevice hipMultiThreadDevice.cpp) make_hip_executable (hipTestMemcpyPin hipTestMemcpyPin.cpp) +make_hip_executable (hipDynamicShared hipDynamicShared.cpp) make_test(hip_ballot " " ) make_test(hip_anyall " " ) @@ -251,3 +252,5 @@ if (${HIP_PLATFORM} STREQUAL "hcc") endif() make_hipify_test(specialFunc.cu ) + +make_test(hipDynamicShared " ") diff --git a/tests/src/hipDynamicShared.cpp b/tests/src/hipDynamicShared.cpp new file mode 100644 index 0000000000..329529281c --- /dev/null +++ b/tests/src/hipDynamicShared.cpp @@ -0,0 +1,138 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include "test_common.h" + +template +__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t numElements, size_t groupElements) { + + // declare dynamic shared memory + HIP_DYNAMIC_SHARED(T, sdata) + + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; + + // initialize dynamic shared memory + if (tid < groupElements) { + sdata[tid] = static_cast(tid); + } + + // prefix sum inside dynamic shared memory + if (groupElements >= 512) { + if (tid >= 256) { sdata[tid] += sdata[tid - 256]; } __syncthreads(); + } + if (groupElements >= 256) { + if (tid >= 128) { sdata[tid] += sdata[tid - 128]; } __syncthreads(); + } + if (groupElements >= 128) { + if (tid >= 64) { sdata[tid] += sdata[tid - 64]; } __syncthreads(); + } + if (groupElements >= 64) { sdata[tid] += sdata[tid - 32]; } __syncthreads(); + if (groupElements >= 32) { sdata[tid] += sdata[tid - 16]; } __syncthreads(); + if (groupElements >= 16) { sdata[tid] += sdata[tid - 8]; } __syncthreads(); + if (groupElements >= 8) { sdata[tid] += sdata[tid - 4]; } __syncthreads(); + if (groupElements >= 4) { sdata[tid] += sdata[tid - 2]; } __syncthreads(); + if (groupElements >= 2) { sdata[tid] += sdata[tid - 1]; } __syncthreads(); + + C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements]; +} + +template +void testExternShared(size_t N, size_t groupElements) { + size_t Nbytes = N * sizeof(T); + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + //printf("blocksPerCU: %d\nthreadsPerBlock: %d\nN: %zu\n", blocksPerCU, threadsPerBlock, N); + + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + // calculate the amount of dynamic shared memory required + size_t groupMemBytes = groupElements * sizeof(T); + + // launch kernel with dynamic shared memory + hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel), dim3(blocks), dim3(threadsPerBlock), groupMemBytes, 0, A_d, B_d, C_d, N, groupElements); + + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + // verify + for (size_t i = 0; i < N; ++i) { + size_t tid = (i % groupElements); + T sumFromSharedMemory = static_cast(tid * (tid + 1) / 2); + T expected = A_h[i] + B_h[i] + sumFromSharedMemory; + if (C_h[i] != expected) { + std::cout << std::fixed << std::setprecision(32); + std::cout << "At " << i << std::endl; + std::cout << " Computed:" << C_h[i] << std::endl; + std::cout << " Expected:" << expected << std::endl; + std::cout << sumFromSharedMemory << std::endl; + std::cout << A_h[i] << std::endl; + std::cout << B_h[i] << std::endl; + + failed("Failed at index:%zu\n", i); + } + } + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); +} + +int main(int argc, char *argv[]) { + HipTest::parseStandardArguments(argc, argv, true); + + //printf("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + testExternShared(1024, 4); + testExternShared(1024, 8); + testExternShared(1024, 16); + testExternShared(1024, 32); + testExternShared(1024, 64); + + testExternShared(65536, 4); + testExternShared(65536, 8); + testExternShared(65536, 16); + testExternShared(65536, 32); + testExternShared(65536, 64); + + testExternShared(1024, 4); + testExternShared(1024, 8); + testExternShared(1024, 16); + testExternShared(1024, 32); + testExternShared(1024, 64); + + testExternShared(65536, 4); + testExternShared(65536, 8); + testExternShared(65536, 16); + testExternShared(65536, 32); + testExternShared(65536, 64); + + passed(); +} +