Use native support for atomic FADD when address is in LDS (#1591)
This commit is contained in:
zatwierdzone przez
Maneesh Gupta
rodzic
d4dde7a27d
commit
2ed3a0873c
@@ -55,7 +55,7 @@ unsigned long long atomicAdd(
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
float atomicAdd(float* address, float val)
|
||||
float atomicAdd_impl(float* address, float val)
|
||||
{
|
||||
unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
|
||||
unsigned int r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
|
||||
@@ -73,6 +73,37 @@ float atomicAdd(float* address, float val)
|
||||
|
||||
return __uint_as_float(r);
|
||||
}
|
||||
#if !__has_builtin(__builtin_amdgcn_is_shared)
|
||||
__device__
|
||||
inline
|
||||
bool __builtin_amdgcn_is_shared(
|
||||
const __attribute__((address_space(0))) void* ptr) noexcept
|
||||
{
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
const unsigned int gp = reinterpret_cast<unsigned long long>(ptr);
|
||||
|
||||
return gp ==
|
||||
(__builtin_amdgcn_s_getreg((15 << 11) | (16 << 6) | 15) << 16);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
__device__
|
||||
inline
|
||||
float atomicAdd(float* address, float val)
|
||||
{
|
||||
using GP = const __attribute__((address_space(0))) void*;
|
||||
using LP = __attribute__((address_space(3))) float*;
|
||||
|
||||
#if __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
|
||||
if (__builtin_amdgcn_is_shared((GP) address)) {
|
||||
return __builtin_amdgcn_ds_faddf((LP) address, val, 0, 0, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
return atomicAdd_impl(address, val);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
double atomicAdd(double* address, double val)
|
||||
|
||||
@@ -29,6 +29,7 @@ THE SOFTWARE.
|
||||
|
||||
// includes, system
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -212,7 +213,7 @@ __device__
|
||||
void testKernelSub(...) {}
|
||||
|
||||
template<
|
||||
typename T,
|
||||
typename T,
|
||||
typename enable_if<
|
||||
is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
|
||||
__device__
|
||||
@@ -259,13 +260,37 @@ void testKernelIntegral(T* g_odata) {
|
||||
testKernelSub(g_odata);
|
||||
}
|
||||
|
||||
namespace {
|
||||
constexpr unsigned int numData = 11;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__global__ void testKernel(T* g_odata) {
|
||||
__global__ void testKernel(T* g_odata, T* g_sdata) {
|
||||
// Atomic addition
|
||||
atomicAdd(&g_odata[0], 10);
|
||||
|
||||
testKernelIntegral(g_odata);
|
||||
testKernelExch(g_odata);
|
||||
|
||||
#if !defined(HIP_PLATFORM_NVCC)
|
||||
// Shared Atomic addition.
|
||||
__shared__ T s_odata[numData];
|
||||
|
||||
if (threadIdx.x == 0) { s_odata[8] = s_odata[10] = 0xff; }
|
||||
|
||||
__syncthreads();
|
||||
|
||||
atomicAdd(&s_odata[0], 10);
|
||||
|
||||
testKernelIntegral(s_odata);
|
||||
testKernelExch(s_odata);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
__builtin_memcpy(g_sdata, s_odata, sizeof(T) * numData);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
@@ -286,33 +311,38 @@ void runTest() {
|
||||
|
||||
unsigned int numThreads = 256;
|
||||
unsigned int numBlocks = 64;
|
||||
unsigned int numData = 11;
|
||||
unsigned int memSize = sizeof(T) * numData;
|
||||
|
||||
// allocate mem for the result on host side
|
||||
T* hOData = (T*)malloc(memSize);
|
||||
T* hSData = (T*)malloc(memSize);
|
||||
|
||||
// initialize the memory
|
||||
for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
|
||||
std::memset(hOData, 0, memSize);
|
||||
std::memset(hSData, 0, memSize);
|
||||
|
||||
// To make the AND and XOR tests generate something other than 0...
|
||||
hOData[8] = hOData[10] = 0xff;
|
||||
|
||||
// allocate device memory for result
|
||||
T* dOData;
|
||||
T* dSData;
|
||||
hipMalloc((void**)&dOData, memSize);
|
||||
hipMalloc((void**)&dSData, memSize);
|
||||
// copy host memory to device to initialize to zero
|
||||
hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);
|
||||
|
||||
// execute the kernel
|
||||
hipLaunchKernelGGL(
|
||||
testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
|
||||
testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData, dSData);
|
||||
|
||||
// Copy result from device to host
|
||||
hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(hSData, dSData, memSize, hipMemcpyDeviceToHost);
|
||||
|
||||
// Compute reference solution
|
||||
testResult = computeGold(hOData, numThreads * numBlocks);
|
||||
testResult = std::equal(hOData, hOData + numData, hSData) &&
|
||||
computeGold(hOData, numThreads * numBlocks);
|
||||
|
||||
// Cleanup memory
|
||||
free(hOData);
|
||||
|
||||
Reference in New Issue
Block a user