Use native support for atomic FADD when address is in LDS (#1591)

This commit is contained in:
Alex Voicu
2019-11-22 02:23:48 +00:00
zatwierdzone przez Maneesh Gupta
rodzic d4dde7a27d
commit 2ed3a0873c
2 zmienionych plików z 68 dodań i 7 usunięć
@@ -55,7 +55,7 @@ unsigned long long atomicAdd(
}
__device__
inline
float atomicAdd(float* address, float val)
float atomicAdd_impl(float* address, float val)
{
unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
@@ -73,6 +73,37 @@ float atomicAdd(float* address, float val)
return __uint_as_float(r);
}
#if !__has_builtin(__builtin_amdgcn_is_shared)
__device__
inline
bool __builtin_amdgcn_is_shared(
const __attribute__((address_space(0))) void* ptr) noexcept
{
#if defined(__HIP_DEVICE_COMPILE__)
const unsigned int gp = reinterpret_cast<unsigned long long>(ptr);
return gp ==
(__builtin_amdgcn_s_getreg((15 << 11) | (16 << 6) | 15) << 16);
#else
return false;
#endif
}
#endif
__device__
inline
float atomicAdd(float* address, float val)
{
using GP = const __attribute__((address_space(0))) void*;
using LP = __attribute__((address_space(3))) float*;
#if __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
if (__builtin_amdgcn_is_shared((GP) address)) {
return __builtin_amdgcn_ds_faddf((LP) address, val, 0, 0, false);
}
#endif
return atomicAdd_impl(address, val);
}
__device__
inline
double atomicAdd(double* address, double val)
@@ -29,6 +29,7 @@ THE SOFTWARE.
// includes, system
#include <algorithm>
#include <cstring>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@@ -212,7 +213,7 @@ __device__
void testKernelSub(...) {}
template<
typename T,
typename T,
typename enable_if<
is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
__device__
@@ -259,13 +260,37 @@ void testKernelIntegral(T* g_odata) {
testKernelSub(g_odata);
}
namespace {
constexpr unsigned int numData = 11;
}
template<typename T>
__global__ void testKernel(T* g_odata) {
__global__ void testKernel(T* g_odata, T* g_sdata) {
// Atomic addition
atomicAdd(&g_odata[0], 10);
testKernelIntegral(g_odata);
testKernelExch(g_odata);
#if !defined(HIP_PLATFORM_NVCC)
// Shared Atomic addition.
__shared__ T s_odata[numData];
if (threadIdx.x == 0) { s_odata[8] = s_odata[10] = 0xff; }
__syncthreads();
atomicAdd(&s_odata[0], 10);
testKernelIntegral(s_odata);
testKernelExch(s_odata);
__syncthreads();
if (threadIdx.x == 0) {
__builtin_memcpy(g_sdata, s_odata, sizeof(T) * numData);
}
#endif
}
template<typename T>
@@ -286,33 +311,38 @@ void runTest() {
unsigned int numThreads = 256;
unsigned int numBlocks = 64;
unsigned int numData = 11;
unsigned int memSize = sizeof(T) * numData;
// allocate mem for the result on host side
T* hOData = (T*)malloc(memSize);
T* hSData = (T*)malloc(memSize);
// initialize the memory
for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
std::memset(hOData, 0, memSize);
std::memset(hSData, 0, memSize);
// To make the AND and XOR tests generate something other than 0...
hOData[8] = hOData[10] = 0xff;
// allocate device memory for result
T* dOData;
T* dSData;
hipMalloc((void**)&dOData, memSize);
hipMalloc((void**)&dSData, memSize);
// copy host memory to device to initialize to zero
hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);
// execute the kernel
hipLaunchKernelGGL(
testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData, dSData);
// Copy result from device to host
hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
hipMemcpy(hSData, dSData, memSize, hipMemcpyDeviceToHost);
// Compute reference solution
testResult = computeGold(hOData, numThreads * numBlocks);
testResult = std::equal(hOData, hOData + numData, hSData) &&
computeGold(hOData, numThreads * numBlocks);
// Cleanup memory
free(hOData);