From 2ed3a0873cbcfa1e2b7ac3a83ccc3aa31154fcda Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 22 Nov 2019 02:23:48 +0000 Subject: [PATCH] Use native support for atomic FADD when address is in LDS (#1591) --- hipamd/include/hip/hcc_detail/hip_atomic.h | 33 ++++++++++++++- .../src/deviceLib/hipSimpleAtomicsTest.cpp | 42 ++++++++++++++++--- 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/hip_atomic.h b/hipamd/include/hip/hcc_detail/hip_atomic.h index 263f639e96..c8dcf02322 100644 --- a/hipamd/include/hip/hcc_detail/hip_atomic.h +++ b/hipamd/include/hip/hcc_detail/hip_atomic.h @@ -55,7 +55,7 @@ unsigned long long atomicAdd( } __device__ inline -float atomicAdd(float* address, float val) +float atomicAdd_impl(float* address, float val) { unsigned int* uaddr{reinterpret_cast(address)}; unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; @@ -73,6 +73,37 @@ float atomicAdd(float* address, float val) return __uint_as_float(r); } +#if !__has_builtin(__builtin_amdgcn_is_shared) + __device__ + inline + bool __builtin_amdgcn_is_shared( + const __attribute__((address_space(0))) void* ptr) noexcept + { + #if defined(__HIP_DEVICE_COMPILE__) + const unsigned int gp = reinterpret_cast(ptr); + + return gp == + (__builtin_amdgcn_s_getreg((15 << 11) | (16 << 6) | 15) << 16); + #else + return false; + #endif + } +#endif +__device__ +inline +float atomicAdd(float* address, float val) +{ + using GP = const __attribute__((address_space(0))) void*; + using LP = __attribute__((address_space(3))) float*; + + #if __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ + if (__builtin_amdgcn_is_shared((GP) address)) { + return __builtin_amdgcn_ds_faddf((LP) address, val, 0, 0, false); + } + #endif + + return atomicAdd_impl(address, val); +} __device__ inline double atomicAdd(double* address, double val) diff --git a/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp index 760d65e555..5a09799c0e 100644 --- a/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp +++ b/hipamd/tests/src/deviceLib/hipSimpleAtomicsTest.cpp @@ -29,6 +29,7 @@ THE SOFTWARE. // includes, system #include +#include #include #include #include @@ -212,7 +213,7 @@ __device__ void testKernelSub(...) {} template< - typename T, + typename T, typename enable_if< is_same{} || is_same{}>::type* = nullptr> __device__ @@ -259,13 +260,37 @@ void testKernelIntegral(T* g_odata) { testKernelSub(g_odata); } +namespace { + constexpr unsigned int numData = 11; +} + template -__global__ void testKernel(T* g_odata) { +__global__ void testKernel(T* g_odata, T* g_sdata) { // Atomic addition atomicAdd(&g_odata[0], 10); testKernelIntegral(g_odata); testKernelExch(g_odata); + + #if !defined(HIP_PLATFORM_NVCC) + // Shared Atomic addition. + __shared__ T s_odata[numData]; + + if (threadIdx.x == 0) { s_odata[8] = s_odata[10] = 0xff; } + + __syncthreads(); + + atomicAdd(&s_odata[0], 10); + + testKernelIntegral(s_odata); + testKernelExch(s_odata); + + __syncthreads(); + + if (threadIdx.x == 0) { + __builtin_memcpy(g_sdata, s_odata, sizeof(T) * numData); + } + #endif } template @@ -286,33 +311,38 @@ void runTest() { unsigned int numThreads = 256; unsigned int numBlocks = 64; - unsigned int numData = 11; unsigned int memSize = sizeof(T) * numData; // allocate mem for the result on host side T* hOData = (T*)malloc(memSize); + T* hSData = (T*)malloc(memSize); // initialize the memory - for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; + std::memset(hOData, 0, memSize); + std::memset(hSData, 0, memSize); // To make the AND and XOR tests generate something other than 0... hOData[8] = hOData[10] = 0xff; // allocate device memory for result T* dOData; + T* dSData; hipMalloc((void**)&dOData, memSize); + hipMalloc((void**)&dSData, memSize); // copy host memory to device to initialize to zero hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice); // execute the kernel hipLaunchKernelGGL( - testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); + testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData, dSData); // Copy result from device to host hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost); + hipMemcpy(hSData, dSData, memSize, hipMemcpyDeviceToHost); // Compute reference solution - testResult = computeGold(hOData, numThreads * numBlocks); + testResult = std::equal(hOData, hOData + numData, hSData) && + computeGold(hOData, numThreads * numBlocks); // Cleanup memory free(hOData);